SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
59static cl::opt<bool> UseDivergentRegisterIndexing(
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, these operations are not really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
182 computeRegisterProperties(Subtarget->getRegisterInfo());
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
188 setBooleanContents(ZeroOrOneBooleanContent);
189 setBooleanVectorContents(ZeroOrOneBooleanContent);
190
191 // We need to custom lower vector stores from local memory
192 setOperationAction(ISD::LOAD,
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
199 setOperationAction(ISD::STORE,
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
224 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
266 setOperationAction(ISD::SELECT, MVT::f64, Promote);
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
274 setOperationAction(ISD::SETCC, MVT::i1, Promote);
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
352 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
355 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
358 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
361 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
366 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
369 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
372 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
375 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
380 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
383 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
386 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
389 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
394 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
397 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
400 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
403 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
408 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
411 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
414 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
417 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value,
452 // let LLVM add the comparison
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
465 // On SI this is s_memtime; on VI it is s_memrealtime.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
655 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
657 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
660 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
662 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
788 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
789 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
790 Custom);
791
792 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
793 // Split vector operations.
798 VT, Custom);
799
800 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
801 // Split vector operations.
803 VT, Custom);
804
805 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
806 Custom);
807
808 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
809 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
810 Custom);
811
812 if (Subtarget->hasPackedFP32Ops()) {
814 MVT::v2f32, Legal);
816 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
817 Custom);
818 }
819 }
820
822
823 if (Subtarget->has16BitInsts()) {
825 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
827 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
828 } else {
829 // Legalization hack.
830 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
831
833 }
834
836 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
837 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
838 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
839 MVT::v32f16, MVT::v32bf16},
840 Custom);
841
843
844 if (Subtarget->hasScalarSMulU64())
846
847 if (Subtarget->hasMad64_32())
849
850 if (Subtarget->hasPrefetch())
852
853 if (Subtarget->hasIEEEMinMax()) {
855 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
858 Custom);
859 } else {
860 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
861 if (Subtarget->hasMinimum3Maximum3F32())
863
864 if (Subtarget->hasMinimum3Maximum3PKF16())
866 }
867
869 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
870 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
871 MVT::i8},
872 Custom);
873
875 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
876 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
877 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
878 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
879 Custom);
880
882 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
883 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
884 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
885 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
886 Custom);
887
893
894 // TODO: Could move this to custom lowering, could benefit from combines on
895 // extract of relevant bits.
897
899
900 if (Subtarget->hasBF16ConversionInsts()) {
904 }
905
906 if (Subtarget->hasCvtPkF16F32Inst()) {
908 }
909
912 ISD::SUB,
914 ISD::MUL,
915 ISD::FADD,
916 ISD::FSUB,
917 ISD::FDIV,
918 ISD::FMUL,
925 ISD::FMA,
926 ISD::SMIN,
927 ISD::SMAX,
928 ISD::UMIN,
929 ISD::UMAX,
932 ISD::SMIN,
933 ISD::SMAX,
934 ISD::UMIN,
935 ISD::UMAX,
936 ISD::AND,
937 ISD::OR,
938 ISD::XOR,
939 ISD::SHL,
940 ISD::SRL,
941 ISD::SRA,
942 ISD::FSHR,
952
953 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
955
956 // All memory operations. Some folding on the pointer operand is done to help
957 // matching the constant offsets in the addressing modes.
982
983 // FIXME: In other contexts we pretend this is a per-function property.
985
987}
988
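
The constructor above is essentially a long sequence of setOperationAction / AddPromotedToType / setTruncStoreAction calls that fill in per-(opcode, type) legalization tables which the SelectionDAG legalizer later consults. As a minimal standalone sketch of that mechanism (the table layout and names below are invented for illustration and are not the LLVM API):

#include <map>
#include <utility>

// Toy model of an operation-action table: the real TargetLoweringBase keeps a
// per-(opcode, value-type) action plus a separate "promoted-to" type map.
enum class Action { Legal, Promote, Expand, Custom };

struct MiniActionTable {
  std::map<std::pair<int, int>, Action> Actions;   // (opcode, type) -> action
  std::map<std::pair<int, int>, int> PromotedType; // (opcode, type) -> wider type

  void setOperationAction(int Opc, int VT, Action A) { Actions[{Opc, VT}] = A; }
  void addPromotedToType(int Opc, int VT, int DestVT) { PromotedType[{Opc, VT}] = DestVT; }

  // A legalizer queries the table for each node; anything unlisted is Legal.
  Action getAction(int Opc, int VT) const {
    auto It = Actions.find({Opc, VT});
    return It == Actions.end() ? Action::Legal : It->second;
  }
};

For example, the setOperationAction(ISD::SELECT, MVT::bf16, Promote) / AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16) pair above means: when a bf16 select is seen, legalize it by performing the same operation on i16 instead.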
989const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
990
992 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
993 return RCRegs;
994}
995
996//===----------------------------------------------------------------------===//
997// TargetLowering queries
998//===----------------------------------------------------------------------===//
999
1000// v_mad_mix* support a conversion from f16 to f32.
1001//
1002// There is only one special case where this would be OK to use with denormals
1003// enabled, but we don't currently handle it.
1004bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1005 EVT DestVT, EVT SrcVT) const {
1006 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1007 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1008 DestVT.getScalarType() == MVT::f32 &&
1009 SrcVT.getScalarType() == MVT::f16 &&
1010 // TODO: This probably only requires no input flushing?
1011 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1012}
1013
1014bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1015 LLT DestTy, LLT SrcTy) const {
1016 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1017 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1018 DestTy.getScalarSizeInBits() == 32 &&
1019 SrcTy.getScalarSizeInBits() == 16 &&
1020 // TODO: This probably only requires no input flushing?
1021 denormalModeIsFlushAllF32(*MI.getMF());
1022}
1023
1024bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1025 // SI has some legal vector types, but no legal vector operations. Say no
1026 // shuffles are legal in order to prefer scalarizing some vector operations.
1027 return false;
1028}
1029
1030MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1031 CallingConv::ID CC,
1032 EVT VT) const {
1033 if (CC == CallingConv::AMDGPU_KERNEL)
1034 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1035
1036 if (VT.isVector()) {
1037 EVT ScalarVT = VT.getScalarType();
1038 unsigned Size = ScalarVT.getSizeInBits();
1039 if (Size == 16) {
1040 if (Subtarget->has16BitInsts()) {
1041 if (VT.isInteger())
1042 return MVT::v2i16;
1043 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1044 }
1045 return VT.isInteger() ? MVT::i32 : MVT::f32;
1046 }
1047
1048 if (Size < 16)
1049 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1050 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1051 }
1052
1053 if (VT.getSizeInBits() > 32)
1054 return MVT::i32;
1055
1057}
1058
1059unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1060 CallingConv::ID CC,
1061 EVT VT) const {
1062 if (CC == CallingConv::AMDGPU_KERNEL)
1063 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1064
1065 if (VT.isVector()) {
1066 unsigned NumElts = VT.getVectorNumElements();
1067 EVT ScalarVT = VT.getScalarType();
1068 unsigned Size = ScalarVT.getSizeInBits();
1069
1070 // FIXME: Should probably promote 8-bit vectors to i16.
1071 if (Size == 16 && Subtarget->has16BitInsts())
1072 return (NumElts + 1) / 2;
1073
1074 if (Size <= 32)
1075 return NumElts;
1076
1077 if (Size > 32)
1078 return NumElts * ((Size + 31) / 32);
1079 } else if (VT.getSizeInBits() > 32)
1080 return (VT.getSizeInBits() + 31) / 32;
1081
1082 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1083}
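
To make the register-count arithmetic in getNumRegistersForCallingConv concrete, here is a small standalone sketch (plain C++, no LLVM types) mirroring the vector branch above: 16-bit elements pair up two per 32-bit register when 16-bit instructions are available, elements of at most 32 bits take one register each, and wider elements take one register per 32-bit chunk.

#include <cassert>

// Mirrors the vector branch of getNumRegistersForCallingConv above.
unsigned numRegsForVector(unsigned NumElts, unsigned ScalarBits,
                          bool Has16BitInsts) {
  if (ScalarBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2;                // two 16-bit elements per register
  if (ScalarBits <= 32)
    return NumElts;                          // one 32-bit register per element
  return NumElts * ((ScalarBits + 31) / 32); // e.g. 64-bit elements use two each
}

int main() {
  assert(numRegsForVector(5, 16, true) == 3); // v5f16 -> three v2f16 registers
  assert(numRegsForVector(3, 32, true) == 3); // v3i32 -> three i32 registers
  assert(numRegsForVector(2, 64, true) == 4); // v2i64 -> four i32 registers
  return 0;
}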
1084
1086 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1087 unsigned &NumIntermediates, MVT &RegisterVT) const {
1088 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1089 unsigned NumElts = VT.getVectorNumElements();
1090 EVT ScalarVT = VT.getScalarType();
1091 unsigned Size = ScalarVT.getSizeInBits();
1092 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1093 // support, but unless we can properly handle 3-vectors, it will still be
1094 // inconsistent.
1095 if (Size == 16 && Subtarget->has16BitInsts()) {
1096 if (ScalarVT == MVT::bf16) {
1097 RegisterVT = MVT::i32;
1098 IntermediateVT = MVT::v2bf16;
1099 } else {
1100 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1101 IntermediateVT = RegisterVT;
1102 }
1103 NumIntermediates = (NumElts + 1) / 2;
1104 return NumIntermediates;
1105 }
1106
1107 if (Size == 32) {
1108 RegisterVT = ScalarVT.getSimpleVT();
1109 IntermediateVT = RegisterVT;
1110 NumIntermediates = NumElts;
1111 return NumIntermediates;
1112 }
1113
1114 if (Size < 16 && Subtarget->has16BitInsts()) {
1115 // FIXME: Should probably form v2i16 pieces
1116 RegisterVT = MVT::i16;
1117 IntermediateVT = ScalarVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1120 }
1121
1122 if (Size != 16 && Size <= 32) {
1123 RegisterVT = MVT::i32;
1124 IntermediateVT = ScalarVT;
1125 NumIntermediates = NumElts;
1126 return NumIntermediates;
1127 }
1128
1129 if (Size > 32) {
1130 RegisterVT = MVT::i32;
1131 IntermediateVT = RegisterVT;
1132 NumIntermediates = NumElts * ((Size + 31) / 32);
1133 return NumIntermediates;
1134 }
1135 }
1136
1137 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1138 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1139}
1140
1141static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1142 const DataLayout &DL, Type *Ty,
1143 unsigned MaxNumLanes) {
1144 assert(MaxNumLanes != 0);
1145
1146 LLVMContext &Ctx = Ty->getContext();
1147 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1148 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1149 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1150 NumElts);
1151 }
1152
1153 return TLI.getValueType(DL, Ty);
1154}
1155
1156// Peek through TFE struct returns to only use the data size.
1157static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1158 const DataLayout &DL, Type *Ty,
1159 unsigned MaxNumLanes) {
1160 auto *ST = dyn_cast<StructType>(Ty);
1161 if (!ST)
1162 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1163
1164 // TFE intrinsics return an aggregate type.
1165 assert(ST->getNumContainedTypes() == 2 &&
1166 ST->getContainedType(1)->isIntegerTy(32));
1167 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1168}
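
The MaxNumLanes parameter threaded through the two helpers above ultimately comes from an image intrinsic's dmask: getTgtMemIntrinsic further down counts the set dmask bits (treating 0 as 1) and narrows the memory type to that many lanes. A standalone sketch of just that rule (C++20 for std::popcount):

#include <bit>
#include <cassert>
#include <cstdint>

// For non-gather image loads, the dmask operand selects which of the four
// channels are actually returned; a dmask of 0 still transfers one lane.
unsigned lanesFromDMask(uint32_t DMask) {
  return DMask == 0 ? 1 : std::popcount(DMask);
}

int main() {
  assert(lanesFromDMask(0b1111) == 4); // all four channels
  assert(lanesFromDMask(0b0101) == 2); // two channels selected
  assert(lanesFromDMask(0) == 1);      // degenerate dmask still loads one lane
  return 0;
}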
1169
1170/// Map address space 7 to MVT::v5i32 because that's its in-memory
1171/// representation. This return value is vector-typed because there is no
1172/// MVT::i160 and it is not clear if one can be added. While this could
1173/// cause issues during codegen, these address space 7 pointers will be
1174/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1175/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1176/// modeling, to work.
1177MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1178 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1179 return MVT::v5i32;
1180 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1181 DL.getPointerSizeInBits(AS) == 192)
1182 return MVT::v6i32;
1183 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1184}
1185/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1186/// v8i32 when padding is added.
1187/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1188/// also v8i32 with padding.
1189MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1190 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1191 DL.getPointerSizeInBits(AS) == 160) ||
1192 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1193 DL.getPointerSizeInBits(AS) == 192))
1194 return MVT::v8i32;
1195 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1196}
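
The dword counts behind the two pointer-type mappings above are simple arithmetic: a 160-bit buffer fat pointer occupies five 32-bit registers and a 192-bit buffer strided pointer six, while the comments note that both are padded out to a 256-bit (eight-dword) slot in memory. A small standalone check of those numbers:

// Dword bookkeeping for the register and memory forms described above.
constexpr unsigned DWordBits = 32;
constexpr unsigned BufferFatPtrBits = 160;     // p7: {p8, i32} register form
constexpr unsigned BufferStridedPtrBits = 192; // p9: {p8, i32, i32} register form
constexpr unsigned PaddedMemBits = 256;        // both padded in memory

static_assert(BufferFatPtrBits / DWordBits == 5, "p7 register form is v5i32");
static_assert(BufferStridedPtrBits / DWordBits == 6, "p9 register form is v6i32");
static_assert(PaddedMemBits / DWordBits == 8, "padded in-memory form is v8i32");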
1197
1198bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1199 const CallInst &CI,
1200 MachineFunction &MF,
1201 unsigned IntrID) const {
1203 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1204 Info.flags |= MachineMemOperand::MOInvariant;
1205 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1206 Info.flags |= MachineMemOperand::MONonTemporal;
1207 Info.flags |= getTargetMMOFlags(CI);
1208
1209 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1210 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1211 AttributeList Attr =
1213 MemoryEffects ME = Attr.getMemoryEffects();
1214 if (ME.doesNotAccessMemory())
1215 return false;
1216
1217 // TODO: Should images get their own address space?
1218 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1219
1220 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1221 if (RsrcIntr->IsImage) {
1224 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1225 Info.align.reset();
1226 }
1227
1228 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1229 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1230 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1231 // We conservatively set the memory operand of a buffer intrinsic to the
1232 // base resource pointer, so that we can access alias information about
1233 // those pointers. Cases like "this points at the same value
1234 // but with a different offset" are handled in
1235 // areMemAccessesTriviallyDisjoint.
1236 Info.ptrVal = RsrcArg;
1237 }
1238
1239 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1240 if (!IsSPrefetch) {
1241 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1242 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1244 }
1245
1247 if (ME.onlyReadsMemory()) {
1248 if (RsrcIntr->IsImage) {
1249 unsigned MaxNumLanes = 4;
1250
1251 if (!BaseOpcode->Gather4) {
1252 // If this isn't a gather, we may have excess loaded elements in the
1253 // IR type. Check the dmask for the real number of elements loaded.
1254 unsigned DMask =
1255 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1256 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1257 }
1258
1259 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1260 CI.getType(), MaxNumLanes);
1261 } else {
1262 Info.memVT =
1264 std::numeric_limits<unsigned>::max());
1265 }
1266
1267 // FIXME: What does alignment mean for an image?
1270 } else if (ME.onlyWritesMemory()) {
1272
1273 Type *DataTy = CI.getArgOperand(0)->getType();
1274 if (RsrcIntr->IsImage) {
1275 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1276 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1277 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1278 DMaskLanes);
1279 } else
1280 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1281
1283 } else {
1284 // Atomic, NoReturn Sampler or prefetch
1287 Info.flags |=
1289
1290 if (!IsSPrefetch)
1292
1293 switch (IntrID) {
1294 default:
1295 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1296 // Fake memory access type for no return sampler intrinsics
1297 Info.memVT = MVT::i32;
1298 } else {
1299 // XXX - Should this be volatile without known ordering?
1301 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1302 }
1303 break;
1304 case Intrinsic::amdgcn_raw_buffer_load_lds:
1305 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1306 case Intrinsic::amdgcn_struct_buffer_load_lds:
1307 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1308 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1309 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1310 Info.ptrVal = CI.getArgOperand(1);
1311 return true;
1312 }
1313 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1314 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1315 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1316 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1317 Info.memVT =
1319 std::numeric_limits<unsigned>::max());
1320 Info.flags &= ~MachineMemOperand::MOStore;
1321 return true;
1322 }
1323 }
1324 }
1325 return true;
1326 }
1327
1328 switch (IntrID) {
1329 case Intrinsic::amdgcn_ds_ordered_add:
1330 case Intrinsic::amdgcn_ds_ordered_swap: {
1332 Info.memVT = MVT::getVT(CI.getType());
1333 Info.ptrVal = CI.getOperand(0);
1334 Info.align.reset();
1336
1337 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1338 if (!Vol->isZero())
1339 Info.flags |= MachineMemOperand::MOVolatile;
1340
1341 return true;
1342 }
1343 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1344 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1346 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1347 Info.ptrVal = nullptr;
1348 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1350 return true;
1351 }
1352 case Intrinsic::amdgcn_ds_append:
1353 case Intrinsic::amdgcn_ds_consume: {
1355 Info.memVT = MVT::getVT(CI.getType());
1356 Info.ptrVal = CI.getOperand(0);
1357 Info.align.reset();
1359
1360 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1361 if (!Vol->isZero())
1362 Info.flags |= MachineMemOperand::MOVolatile;
1363
1364 return true;
1365 }
1366 case Intrinsic::amdgcn_global_atomic_csub: {
1368 Info.memVT = MVT::getVT(CI.getType());
1369 Info.ptrVal = CI.getOperand(0);
1370 Info.align.reset();
1373 return true;
1374 }
1375 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1377 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1378
1379 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1380 Info.align.reset();
1381 Info.flags |=
1383 return true;
1384 }
1385 case Intrinsic::amdgcn_global_atomic_fmin_num:
1386 case Intrinsic::amdgcn_global_atomic_fmax_num:
1387 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1388 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1389 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1390 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1392 Info.memVT = MVT::getVT(CI.getType());
1393 Info.ptrVal = CI.getOperand(0);
1394 Info.align.reset();
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_global_load_tr_b64:
1401 case Intrinsic::amdgcn_global_load_tr_b128:
1402 case Intrinsic::amdgcn_ds_read_tr4_b64:
1403 case Intrinsic::amdgcn_ds_read_tr6_b96:
1404 case Intrinsic::amdgcn_ds_read_tr8_b64:
1405 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1407 Info.memVT = MVT::getVT(CI.getType());
1408 Info.ptrVal = CI.getOperand(0);
1409 Info.align.reset();
1411 return true;
1412 }
1413 case Intrinsic::amdgcn_ds_gws_init:
1414 case Intrinsic::amdgcn_ds_gws_barrier:
1415 case Intrinsic::amdgcn_ds_gws_sema_v:
1416 case Intrinsic::amdgcn_ds_gws_sema_br:
1417 case Intrinsic::amdgcn_ds_gws_sema_p:
1418 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1420
1421 const GCNTargetMachine &TM =
1422 static_cast<const GCNTargetMachine &>(getTargetMachine());
1423
1425 Info.ptrVal = MFI->getGWSPSV(TM);
1426
1427 // This is an abstract access, but we need to specify a type and size.
1428 Info.memVT = MVT::i32;
1429 Info.size = 4;
1430 Info.align = Align(4);
1431
1432 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1434 else
1436 return true;
1437 }
1438 case Intrinsic::amdgcn_global_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1444 return true;
1445 }
1446 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1448
1449 const GCNTargetMachine &TM =
1450 static_cast<const GCNTargetMachine &>(getTargetMachine());
1451
1453 Info.ptrVal = MFI->getGWSPSV(TM);
1454
1455 // This is an abstract access, but we need to specify a type and size.
1456 Info.memVT = MVT::i32;
1457 Info.size = 4;
1458 Info.align = Align(4);
1459
1461 return true;
1462 }
1463 case Intrinsic::amdgcn_s_prefetch_data: {
1465 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1466 Info.ptrVal = CI.getArgOperand(0);
1468 return true;
1469 }
1470 default:
1471 return false;
1472 }
1473}
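
One concrete case from the switch above: for the *buffer_load_lds intrinsics, the constant width-in-bytes operand becomes an integer memory type of Width * 8 bits. A trivial standalone sketch of that mapping:

#include <cassert>

// Mirrors the amdgcn_raw/struct_buffer_load_lds cases above.
unsigned memBitsForLoadLdsWidth(unsigned WidthInBytes) { return WidthInBytes * 8; }

int main() {
  assert(memBitsForLoadLdsWidth(1) == 8);
  assert(memBitsForLoadLdsWidth(2) == 16);
  assert(memBitsForLoadLdsWidth(4) == 32); // dword subtransfer
  return 0;
}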
1474
1476 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1477 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1478 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1479 // The DAG's ValueType loses the addrspaces.
1480 // Add them as 2 extra Constant operands "from" and "to".
1481 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1482 unsigned DstAS = I.getType()->getPointerAddressSpace();
1483 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1484 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1485 break;
1486 }
1487 default:
1488 break;
1489 }
1490}
1491
1494 Type *&AccessTy) const {
1495 Value *Ptr = nullptr;
1496 switch (II->getIntrinsicID()) {
1497 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1498 case Intrinsic::amdgcn_ds_append:
1499 case Intrinsic::amdgcn_ds_consume:
1500 case Intrinsic::amdgcn_ds_read_tr4_b64:
1501 case Intrinsic::amdgcn_ds_read_tr6_b96:
1502 case Intrinsic::amdgcn_ds_read_tr8_b64:
1503 case Intrinsic::amdgcn_ds_read_tr16_b64:
1504 case Intrinsic::amdgcn_ds_ordered_add:
1505 case Intrinsic::amdgcn_ds_ordered_swap:
1506 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1507 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1508 case Intrinsic::amdgcn_global_atomic_csub:
1509 case Intrinsic::amdgcn_global_atomic_fmax_num:
1510 case Intrinsic::amdgcn_global_atomic_fmin_num:
1511 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1512 case Intrinsic::amdgcn_global_load_tr_b64:
1513 case Intrinsic::amdgcn_global_load_tr_b128:
1514 Ptr = II->getArgOperand(0);
1515 break;
1516 case Intrinsic::amdgcn_global_load_lds:
1517 Ptr = II->getArgOperand(1);
1518 break;
1519 default:
1520 return false;
1521 }
1522 AccessTy = II->getType();
1523 Ops.push_back(Ptr);
1524 return true;
1525}
1526
1528 unsigned AddrSpace) const {
1529 if (!Subtarget->hasFlatInstOffsets()) {
1530 // Flat instructions do not have offsets, and only have the register
1531 // address.
1532 return AM.BaseOffs == 0 && AM.Scale == 0;
1533 }
1534
1535 decltype(SIInstrFlags::FLAT) FlatVariant =
1539
1540 return AM.Scale == 0 &&
1541 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1542 AM.BaseOffs, AddrSpace, FlatVariant));
1543}
1544
1545bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1546 if (Subtarget->hasFlatGlobalInsts())
1547 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1548
1549 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1550 // Assume we will use FLAT for all global memory accesses
1551 // on VI.
1552 // FIXME: This assumption is currently wrong. On VI we still use
1553 // MUBUF instructions for the r + i addressing mode. As currently
1554 // implemented, the MUBUF instructions only work on buffer < 4GB.
1555 // It may be possible to support > 4GB buffers with MUBUF instructions,
1556 // by setting the stride value in the resource descriptor which would
1557 // increase the size limit to (stride * 4GB). However, this is risky,
1558 // because it has never been validated.
1559 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1560 }
1561
1562 return isLegalMUBUFAddressingMode(AM);
1563}
1564
1565bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1566 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1567 // additionally can do r + r + i with addr64. 32-bit has more addressing
1568 // mode options. Depending on the resource constant, it can also do
1569 // (i64 r0) + (i32 r1) * (i14 i).
1570 //
1571 // Private arrays end up using a scratch buffer most of the time, so also
1572 // assume those use MUBUF instructions. Scratch loads / stores are currently
1573 // implemented as mubuf instructions with offen bit set, so slightly
1574 // different than the normal addr64.
1575 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1576 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1577 return false;
1578
1579 // FIXME: Since we can split immediate into soffset and immediate offset,
1580 // would it make sense to allow any immediate?
1581
1582 switch (AM.Scale) {
1583 case 0: // r + i or just i, depending on HasBaseReg.
1584 return true;
1585 case 1:
1586 return true; // We have r + r or r + i.
1587 case 2:
1588 if (AM.HasBaseReg) {
1589 // Reject 2 * r + r.
1590 return false;
1591 }
1592
1593 // Allow 2 * r as r + r
1594 // Or 2 * r + i is allowed as r + r + i.
1595 return true;
1596 default: // Don't allow n * r
1597 return false;
1598 }
1599}
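
Ignoring the immediate-offset check at the top, the Scale handling in isLegalMUBUFAddressingMode reduces to a small table. A standalone sketch that mirrors just the switch above:

#include <cassert>

// Mirrors the AM.Scale switch above: scale 0 and 1 are always acceptable,
// scale 2 only works when there is no extra base register (2*r can be
// re-expressed as r + r), and larger scales are rejected.
bool mubufScaleIsLegal(int Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0:
  case 1:
    return true;
  case 2:
    return !HasBaseReg; // 2*r (+ i) folds to r + r (+ i); 2*r + r does not
  default:
    return false;
  }
}

int main() {
  assert(mubufScaleIsLegal(1, true));   // r + r or r + i
  assert(mubufScaleIsLegal(2, false));  // 2*r handled as r + r
  assert(!mubufScaleIsLegal(2, true));  // 2*r + r is rejected
  assert(!mubufScaleIsLegal(3, false)); // no n*r support
  return 0;
}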
1600
1601bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1602 const AddrMode &AM, Type *Ty,
1603 unsigned AS,
1604 Instruction *I) const {
1605 // No global is ever allowed as a base.
1606 if (AM.BaseGV)
1607 return false;
1608
1609 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1610 return isLegalGlobalAddressingMode(AM);
1611
1612 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1616 // If the offset isn't a multiple of 4, it probably isn't going to be
1617 // correctly aligned.
1618 // FIXME: Can we get the real alignment here?
1619 if (AM.BaseOffs % 4 != 0)
1620 return isLegalMUBUFAddressingMode(AM);
1621
1622 if (!Subtarget->hasScalarSubwordLoads()) {
1623 // There are no SMRD extloads, so if we have to do a small type access we
1624 // will use a MUBUF load.
1625 // FIXME?: We also need to do this if unaligned, but we don't know the
1626 // alignment here.
1627 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1628 return isLegalGlobalAddressingMode(AM);
1629 }
1630
1631 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1632 // SMRD instructions have an 8-bit, dword offset on SI.
1633 if (!isUInt<8>(AM.BaseOffs / 4))
1634 return false;
1635 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1636 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1637 // in 8-bits, it can use a smaller encoding.
1638 if (!isUInt<32>(AM.BaseOffs / 4))
1639 return false;
1640 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1641 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1642 if (!isUInt<20>(AM.BaseOffs))
1643 return false;
1644 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1645 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1646 // for S_BUFFER_* instructions).
1647 if (!isInt<21>(AM.BaseOffs))
1648 return false;
1649 } else {
1650 // On GFX12, all offsets are signed 24-bit in bytes.
1651 if (!isInt<24>(AM.BaseOffs))
1652 return false;
1653 }
1654
1655 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1656 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1657 AM.BaseOffs < 0) {
1658 // Scalar (non-buffer) loads can only use a negative offset if
1659 // soffset+offset is non-negative. Since the compiler can only prove that
1660 // in a few special cases, it is safer to claim that negative offsets are
1661 // not supported.
1662 return false;
1663 }
1664
1665 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1666 return true;
1667
1668 if (AM.Scale == 1 && AM.HasBaseReg)
1669 return true;
1670
1671 return false;
1672 }
1673
1674 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1675 return Subtarget->enableFlatScratch()
1676 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1677 : isLegalMUBUFAddressingMode(AM);
1678
1679 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1680 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1681 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1682 // field.
1683 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1684 // an 8-bit dword offset but we don't know the alignment here.
1685 if (!isUInt<16>(AM.BaseOffs))
1686 return false;
1687
1688 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1689 return true;
1690
1691 if (AM.Scale == 1 && AM.HasBaseReg)
1692 return true;
1693
1694 return false;
1695 }
1696
1697 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1698 // For an unknown address space, this usually means that this is for some
1699 // reason being used for pure arithmetic, and not based on some addressing
1700 // computation. We don't have instructions that compute pointers with any
1701 // addressing modes, so treat them as having no offset like flat
1702 // instructions.
1704 }
1705
1706 // Assume a user alias of global for unknown address spaces.
1707 return isLegalGlobalAddressingMode(AM);
1708}
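
The generation-dependent scalar-load offset checks above can be summarized in one helper. The sketch below is standalone and only reproduces those range checks (the generation labels are simplified stand-ins, and the additional negative-offset restriction for constant address spaces is not repeated here):

#include <cassert>
#include <cstdint>

enum class Gen { SI, CI, VI, GFX9To11, GFX12 };

// SI: 8-bit dword offset; CI: 32-bit literal dword offset; VI: 20-bit unsigned
// byte offset; GFX9-GFX11: signed 21-bit byte offset; GFX12: signed 24-bit.
bool smrdImmOffsetFits(Gen G, int64_t BaseOffs) {
  auto fitsUnsigned = [](int64_t V, unsigned Bits) {
    return V >= 0 && V < (int64_t(1) << Bits);
  };
  auto fitsSigned = [](int64_t V, unsigned Bits) {
    return V >= -(int64_t(1) << (Bits - 1)) && V < (int64_t(1) << (Bits - 1));
  };
  switch (G) {
  case Gen::SI:       return fitsUnsigned(BaseOffs / 4, 8);
  case Gen::CI:       return fitsUnsigned(BaseOffs / 4, 32);
  case Gen::VI:       return fitsUnsigned(BaseOffs, 20);
  case Gen::GFX9To11: return fitsSigned(BaseOffs, 21);
  case Gen::GFX12:    return fitsSigned(BaseOffs, 24);
  }
  return false;
}

int main() {
  assert(smrdImmOffsetFits(Gen::SI, 1020));  // 255 dwords fits in 8 bits
  assert(!smrdImmOffsetFits(Gen::SI, 2048)); // 512 dwords does not
  assert(smrdImmOffsetFits(Gen::VI, (1 << 20) - 1));
  assert(!smrdImmOffsetFits(Gen::GFX9To11, -(int64_t(1) << 21)));
  assert(smrdImmOffsetFits(Gen::GFX12, -(int64_t(1) << 20)));
  return 0;
}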
1709
1710bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1711 const MachineFunction &MF) const {
1712 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1713 return (MemVT.getSizeInBits() <= 4 * 32);
1714 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1715 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1716 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1717 }
1718 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1719 return (MemVT.getSizeInBits() <= 2 * 32);
1720 return true;
1721}
1722
1723bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1724 unsigned Size, unsigned AddrSpace, Align Alignment,
1725 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1726 if (IsFast)
1727 *IsFast = 0;
1728
1729 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1730 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1731 // Check if alignment requirements for ds_read/write instructions are
1732 // disabled.
1733 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1734 return false;
1735
1736 Align RequiredAlignment(
1737 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1738 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1739 Alignment < RequiredAlignment)
1740 return false;
1741
1742 // Either the alignment requirements are "enabled", or there is an
1743 // unaligned LDS access related hardware bug even though alignment
1744 // requirements are "disabled". In either case, we need to check for proper
1745 // alignment requirements.
1746 //
1747 switch (Size) {
1748 case 64:
1749 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1750 // address is negative, then the instruction is incorrectly treated as
1751 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1752 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1753 // load later in the SILoadStoreOptimizer.
1754 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1755 return false;
1756
1757 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1758 // can do a 4 byte aligned, 8 byte access in a single operation using
1759 // ds_read2/write2_b32 with adjacent offsets.
1760 RequiredAlignment = Align(4);
1761
1762 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1763 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1764 // ds_write2_b32 depending on the alignment. In either case with either
1765 // alignment there is no faster way of doing this.
1766
1767 // The numbers returned here and below are not additive; they are a 'speed
1768 // rank'. They are just meant to be compared to decide if a certain way
1769 // of lowering an operation is faster than another. For that purpose
1770 // naturally aligned operation gets its bitsize to indicate that "it
1771 // operates with a speed comparable to N-bit wide load". With the full
1772 // alignment ds128 is slower than ds96 for example. If underaligned it
1773 // is comparable to a speed of a single dword access, which would then
1774 // mean 32 < 128 and it is faster to issue a wide load regardless.
1775 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1776 // wider load which will not be aligned anymore, the latter is slower.
1777 if (IsFast)
1778 *IsFast = (Alignment >= RequiredAlignment) ? 64
1779 : (Alignment < Align(4)) ? 32
1780 : 1;
1781 return true;
1782 }
1783
1784 break;
1785 case 96:
1786 if (!Subtarget->hasDS96AndDS128())
1787 return false;
1788
1789 // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
1790 // gfx8 and older.
1791
1792 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1793 // Naturally aligned access is fastest. However, also report it is Fast
1794 // if memory is aligned less than DWORD. A narrow load or store will
1795 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1796 // be more of them, so overall we will pay less penalty issuing a single
1797 // instruction.
1798
1799 // See comment on the values above.
1800 if (IsFast)
1801 *IsFast = (Alignment >= RequiredAlignment) ? 96
1802 : (Alignment < Align(4)) ? 32
1803 : 1;
1804 return true;
1805 }
1806
1807 break;
1808 case 128:
1809 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1810 return false;
1811
1812 // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
1813 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1814 // single operation using ds_read2/write2_b64.
1815 RequiredAlignment = Align(8);
1816
1817 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1818 // Naturally aligned access is fastest. However, also report it is Fast
1819 // if memory is aligned less than DWORD. A narrow load or store will
1820 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1821 // will be more of them, so overall we will pay less penalty issuing a
1822 // single instruction.
1823
1824 // See comment on the values above.
1825 if (IsFast)
1826 *IsFast = (Alignment >= RequiredAlignment) ? 128
1827 : (Alignment < Align(4)) ? 32
1828 : 1;
1829 return true;
1830 }
1831
1832 break;
1833 default:
1834 if (Size > 32)
1835 return false;
1836
1837 break;
1838 }
1839
1840 // See comment on the values above.
1841 // Note that we have a single-dword or sub-dword here, so if underaligned
1842 // it is a slowest possible access, hence returned value is 0.
1843 if (IsFast)
1844 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1845
1846 return Alignment >= RequiredAlignment ||
1847 Subtarget->hasUnalignedDSAccessEnabled();
1848 }
1849
1850 // FIXME: We have to be conservative here and assume that flat operations
1851 // will access scratch. If we had access to the IR function, then we
1852 // could determine if any private memory was used in the function.
1853 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1854 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1855 bool AlignedBy4 = Alignment >= Align(4);
1856 if (IsFast)
1857 *IsFast = AlignedBy4;
1858
1859 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1860 }
1861
1862 // So long as they are correct, wide global memory operations perform better
1863 // than multiple smaller memory ops -- even when misaligned
1864 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1865 if (IsFast)
1866 *IsFast = Size;
1867
1868 return Alignment >= Align(4) ||
1870 }
1871
1872 // Smaller than dword value must be aligned.
1873 if (Size < 32)
1874 return false;
1875
1876 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1877 // byte-address are ignored, thus forcing Dword alignment.
1878 // This applies to private, global, and constant memory.
1879 if (IsFast)
1880 *IsFast = 1;
1881
1882 return Size >= 32 && Alignment >= Align(4);
1883}
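
The unsigned value written through IsFast above is a relative "speed rank" rather than a byte count: callers compare the ranks reported for alternative lowerings, and 0 or 1 simply means "slow". A standalone sketch of that comparison, with illustrative rank values taken from the comments above:

#include <cassert>

// Ranks are only meaningful relative to each other; they are never summed.
unsigned pickFasterRank(unsigned RankA, unsigned RankB) {
  return RankA >= RankB ? RankA : RankB;
}

int main() {
  unsigned AlignedDS96 = 96;      // naturally aligned 96-bit LDS access
  unsigned UnderalignedDS96 = 32; // same access at sub-dword alignment
  unsigned AlignedDword = 32;     // a single aligned 32-bit access
  assert(pickFasterRank(AlignedDS96, AlignedDword) == 96);      // wide and aligned wins
  assert(pickFasterRank(UnderalignedDS96, AlignedDword) == 32); // roughly a tie
  return 0;
}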
1884
1885bool SITargetLowering::allowsMisalignedMemoryAccesses(
1886 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1887 unsigned *IsFast) const {
1888 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1889 Alignment, Flags, IsFast);
1890}
1891
1892EVT SITargetLowering::getOptimalMemOpType(
1893 const MemOp &Op, const AttributeList &FuncAttributes) const {
1894 // FIXME: Should account for address space here.
1895
1896 // The default fallback uses the private pointer size as a guess for a type to
1897 // use. Make sure we switch these to 64-bit accesses.
1898
1899 if (Op.size() >= 16 &&
1900 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1901 return MVT::v4i32;
1902
1903 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1904 return MVT::v2i32;
1905
1906 // Use the default.
1907 return MVT::Other;
1908}
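
A standalone sketch of the size and alignment decision in getOptimalMemOpType above (the strings stand in for the returned MVTs):

#include <cassert>
#include <cstddef>
#include <string>

// Mirrors getOptimalMemOpType: prefer 128-bit, then 64-bit, vector accesses for
// sufficiently large dword-aligned operations; otherwise defer to generic code.
std::string optimalMemOpType(std::size_t Size, std::size_t DstAlign) {
  if (Size >= 16 && DstAlign >= 4)
    return "v4i32";
  if (Size >= 8 && DstAlign >= 4)
    return "v2i32";
  return "Other";
}

int main() {
  assert(optimalMemOpType(64, 4) == "v4i32"); // large aligned copy -> 128-bit chunks
  assert(optimalMemOpType(12, 4) == "v2i32");
  assert(optimalMemOpType(8, 1) == "Other");  // underaligned, let generic code choose
  return 0;
}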
1909
1910bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1911 const MemSDNode *MemNode = cast<MemSDNode>(N);
1912 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1913}
1914
1916 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1918}
1919
1921 unsigned DestAS) const {
1922 // Flat -> private/local is a simple truncate.
1923 // Flat -> global is no-op
1924 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1925 return true;
1926
1927 const GCNTargetMachine &TM =
1928 static_cast<const GCNTargetMachine &>(getTargetMachine());
1929 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1930}
1931
1932TargetLoweringBase::LegalizeTypeAction
1933SITargetLowering::getPreferredVectorAction(MVT VT) const {
1934 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1935 VT.getScalarType().bitsLE(MVT::i16))
1936 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1937 return TargetLoweringBase::getPreferredVectorAction(VT);
1938}
1939
1941 Type *Ty) const {
1942 // FIXME: Could be smarter if called for vector constants.
1943 return true;
1944}
1945
1946bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1947 unsigned Index) const {
1948 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
1949 return false;
1950
1951 // TODO: Add more cases that are cheap.
1952 return Index == 0;
1953}
1954
1955bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
1956 // TODO: This should be more aggressive, particular for 16-bit element
1957 // vectors. However there are some mixed improvements and regressions.
1958 EVT EltTy = VT.getVectorElementType();
1959 return EltTy.getSizeInBits() % 32 == 0;
1960}
1961
1962bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1963 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1964 switch (Op) {
1965 case ISD::LOAD:
1966 case ISD::STORE:
1967 return true;
1968 default:
1969 return false;
1970 }
1971 }
1972
1973 // SimplifySetCC uses this function to determine whether or not it should
1974 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1975 if (VT == MVT::i1 && Op == ISD::SETCC)
1976 return false;
1977
1978 return TargetLowering::isTypeDesirableForOp(Op, VT);
1979}
1980
1981SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1982 const SDLoc &SL,
1983 SDValue Chain,
1984 uint64_t Offset) const {
1985 const DataLayout &DL = DAG.getDataLayout();
1989
1990 auto [InputPtrReg, RC, ArgTy] =
1992
1993 // We may not have the kernarg segment argument if we have no kernel
1994 // arguments.
1995 if (!InputPtrReg)
1996 return DAG.getConstant(Offset, SL, PtrVT);
1997
1999 SDValue BasePtr = DAG.getCopyFromReg(
2000 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2001
2002 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2003}
2004
2005SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2006 const SDLoc &SL) const {
2009 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2010}
2011
2012SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2013 const SDLoc &SL) const {
2014
2016 std::optional<uint32_t> KnownSize =
2018 if (KnownSize.has_value())
2019 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2020 return SDValue();
2021}
2022
2023SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2024 const SDLoc &SL, SDValue Val,
2025 bool Signed,
2026 const ISD::InputArg *Arg) const {
2027 // First, if it is a widened vector, narrow it.
2028 if (VT.isVector() &&
2030 EVT NarrowedVT =
2033 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2034 DAG.getConstant(0, SL, MVT::i32));
2035 }
2036
2037 // Then convert the vector elements or scalar value.
2038 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2039 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2040 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2041 }
2042
2043 if (MemVT.isFloatingPoint())
2044 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2045 else if (Signed)
2046 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2047 else
2048 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2049
2050 return Val;
2051}
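// For example, if a 3-element vector argument was widened to 4 elements for
// the in-memory type, the EXTRACT_SUBVECTOR above drops the padding lane
// before any extension or rounding happens. Similarly, a zeroext scalar
// narrower than its in-memory type gets an AssertZext before the final
// zext/trunc, so later combines know the unused high bits are zero.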
2052
2053SDValue SITargetLowering::lowerKernargMemParameter(
2054 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2055 uint64_t Offset, Align Alignment, bool Signed,
2056 const ISD::InputArg *Arg) const {
2058
2059 // Try to avoid using an extload by loading earlier than the argument address,
2060 // and extracting the relevant bits. The load should hopefully be merged with
2061 // the previous argument.
2062 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2063 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2064 int64_t AlignDownOffset = alignDown(Offset, 4);
2065 int64_t OffsetDiff = Offset - AlignDownOffset;
2066
2067 EVT IntVT = MemVT.changeTypeToInteger();
2068
2069 // TODO: If we passed in the base kernel offset we could have a better
2070 // alignment than 4, but we don't really need it.
2071 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2072 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2075
2076 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2077 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2078
2079 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2080 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2081 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2082
2083 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2084 }
2085
2086 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2087 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2090
2091 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2092 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2093}
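// Worked example for the sub-dword path above: an i16 argument at kernarg
// byte offset 6 (only 2-byte aligned) is handled by loading the dword at
// alignDown(6, 4) == 4, shifting right by (6 - 4) * 8 == 16 bits, truncating
// to i16 and bitcasting to the argument type. Ideally that dword load is
// then CSE'd with the load for the argument at offset 4.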
2094
2095SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2096 CCValAssign &VA, const SDLoc &SL,
2097 SDValue Chain,
2098 const ISD::InputArg &Arg) const {
2100 MachineFrameInfo &MFI = MF.getFrameInfo();
2101
2102 if (Arg.Flags.isByVal()) {
2103 unsigned Size = Arg.Flags.getByValSize();
2104 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2105 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2106 }
2107
2108 unsigned ArgOffset = VA.getLocMemOffset();
2109 unsigned ArgSize = VA.getValVT().getStoreSize();
2110
2111 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2112
2113 // Create load nodes to retrieve arguments from the stack.
2114 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2115 SDValue ArgValue;
2116
2117 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2119 MVT MemVT = VA.getValVT();
2120
2121 switch (VA.getLocInfo()) {
2122 default:
2123 break;
2124 case CCValAssign::BCvt:
2125 MemVT = VA.getLocVT();
2126 break;
2127 case CCValAssign::SExt:
2128 ExtType = ISD::SEXTLOAD;
2129 break;
2130 case CCValAssign::ZExt:
2131 ExtType = ISD::ZEXTLOAD;
2132 break;
2133 case CCValAssign::AExt:
2134 ExtType = ISD::EXTLOAD;
2135 break;
2136 }
2137
2138 ArgValue = DAG.getExtLoad(
2139 ExtType, SL, VA.getLocVT(), Chain, FIN,
2141 return ArgValue;
2142}
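// For example, a zeroext i16 argument that did not fit in registers gets a
// fixed frame object at its ABI stack offset and is reloaded here with a
// ZEXTLOAD of i16 that produces the promoted 32-bit LocVT.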
2143
2144SDValue SITargetLowering::getPreloadedValue(
2145 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2147 const ArgDescriptor *Reg = nullptr;
2148 const TargetRegisterClass *RC;
2149 LLT Ty;
2150
2152 const ArgDescriptor WorkGroupIDX =
2153 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2154 // If GridZ is not programmed in an entry function then the hardware will set
2155 // it to all zeros, so there is no need to mask the GridY value in the low
2156 // order bits.
2157 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2158 AMDGPU::TTMP7,
2159 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2160 const ArgDescriptor WorkGroupIDZ =
2161 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2162 if (Subtarget->hasArchitectedSGPRs() &&
2164 switch (PVID) {
2166 Reg = &WorkGroupIDX;
2167 RC = &AMDGPU::SReg_32RegClass;
2168 Ty = LLT::scalar(32);
2169 break;
2171 Reg = &WorkGroupIDY;
2172 RC = &AMDGPU::SReg_32RegClass;
2173 Ty = LLT::scalar(32);
2174 break;
2176 Reg = &WorkGroupIDZ;
2177 RC = &AMDGPU::SReg_32RegClass;
2178 Ty = LLT::scalar(32);
2179 break;
2180 default:
2181 break;
2182 }
2183 }
2184
2185 if (!Reg)
2186 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2187 if (!Reg) {
2189 // It's possible for a kernarg intrinsic call to appear in a kernel with
2190 // no allocated segment, in which case we do not add the user sgpr
2191 // argument, so just return null.
2192 return DAG.getConstant(0, SDLoc(), VT);
2193 }
2194
2195 // It's undefined behavior if a function marked with the amdgpu-no-*
2196 // attributes uses the corresponding intrinsic.
2197 return DAG.getUNDEF(VT);
2198 }
2199
2200 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2201}
2202
2204 CallingConv::ID CallConv,
2205 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2206 FunctionType *FType,
2207 SIMachineFunctionInfo *Info) {
2208 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2209 const ISD::InputArg *Arg = &Ins[I];
2210
2211 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2212 "vector type argument should have been split");
2213
2214 // First check if it's a PS input addr.
2215 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2216 PSInputNum <= 15) {
2217 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2218
2219 // Inconveniently only the first part of the split is marked as isSplit,
2220 // so skip to the end. We only want to increment PSInputNum once for the
2221 // entire split argument.
2222 if (Arg->Flags.isSplit()) {
2223 while (!Arg->Flags.isSplitEnd()) {
2224 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2225 "unexpected vector split in ps argument type");
2226 if (!SkipArg)
2227 Splits.push_back(*Arg);
2228 Arg = &Ins[++I];
2229 }
2230 }
2231
2232 if (SkipArg) {
2233 // We can safely skip PS inputs.
2234 Skipped.set(Arg->getOrigArgIndex());
2235 ++PSInputNum;
2236 continue;
2237 }
2238
2239 Info->markPSInputAllocated(PSInputNum);
2240 if (Arg->Used)
2241 Info->markPSInputEnabled(PSInputNum);
2242
2243 ++PSInputNum;
2244 }
2245
2246 Splits.push_back(*Arg);
2247 }
2248}
2249
2250// Allocate special inputs passed in VGPRs.
2252 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2253 SIMachineFunctionInfo &Info) const {
2254 const LLT S32 = LLT::scalar(32);
2256
2257 if (Info.hasWorkItemIDX()) {
2258 Register Reg = AMDGPU::VGPR0;
2259 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2260
2261 CCInfo.AllocateReg(Reg);
2262 unsigned Mask =
2263 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2264 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2265 }
2266
2267 if (Info.hasWorkItemIDY()) {
2268 assert(Info.hasWorkItemIDX());
2269 if (Subtarget->hasPackedTID()) {
2270 Info.setWorkItemIDY(
2271 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2272 } else {
2273 unsigned Reg = AMDGPU::VGPR1;
2274 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2275
2276 CCInfo.AllocateReg(Reg);
2277 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2278 }
2279 }
2280
2281 if (Info.hasWorkItemIDZ()) {
2282 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2283 if (Subtarget->hasPackedTID()) {
2284 Info.setWorkItemIDZ(
2285 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2286 } else {
2287 unsigned Reg = AMDGPU::VGPR2;
2288 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2289
2290 CCInfo.AllocateReg(Reg);
2291 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2292 }
2293 }
2294}
2295
2296// Try to allocate a VGPR at the end of the argument list, or if no argument
2297// VGPRs are left, allocate a stack slot instead.
2298// If \p Mask is given, it indicates the bitfield position in the register.
2299// If \p Arg is given, reuse it with the new \p Mask instead of allocating anew.
2300static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2301 ArgDescriptor Arg = ArgDescriptor()) {
2302 if (Arg.isSet())
2303 return ArgDescriptor::createArg(Arg, Mask);
2304
2305 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2306 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2307 if (RegIdx == ArgVGPRs.size()) {
2308 // Spill to stack required.
2309 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2310
2311 return ArgDescriptor::createStack(Offset, Mask);
2312 }
2313
2314 unsigned Reg = ArgVGPRs[RegIdx];
2315 Reg = CCInfo.AllocateReg(Reg);
2316 assert(Reg != AMDGPU::NoRegister);
2317
2318 MachineFunction &MF = CCInfo.getMachineFunction();
2319 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2320 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2321 return ArgDescriptor::createRegister(Reg, Mask);
2322}
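// Note on the reuse path above: when the caller passes an already-set \p Arg
// (as allocateSpecialInputVGPRs does for the Y and Z workitem IDs), the same
// VGPR is reused with a different bitfield mask instead of consuming another
// argument VGPR, e.g. X in bits [9:0] and Y in bits [19:10] of one register.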
2323
2325 const TargetRegisterClass *RC,
2326 unsigned NumArgRegs) {
2327 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2328 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2329 if (RegIdx == ArgSGPRs.size())
2330 report_fatal_error("ran out of SGPRs for arguments");
2331
2332 unsigned Reg = ArgSGPRs[RegIdx];
2333 Reg = CCInfo.AllocateReg(Reg);
2334 assert(Reg != AMDGPU::NoRegister);
2335
2336 MachineFunction &MF = CCInfo.getMachineFunction();
2337 MF.addLiveIn(Reg, RC);
2339}
2340
2341// If this has a fixed position, we still should allocate the register in the
2342// CCInfo state. Technically we could get away with this for values passed
2343// outside of the normal argument range.
2345 const TargetRegisterClass *RC,
2346 MCRegister Reg) {
2347 Reg = CCInfo.AllocateReg(Reg);
2348 assert(Reg != AMDGPU::NoRegister);
2349 MachineFunction &MF = CCInfo.getMachineFunction();
2350 MF.addLiveIn(Reg, RC);
2351}
2352
2353static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2354 if (Arg) {
2355 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2356 Arg.getRegister());
2357 } else
2358 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2359}
2360
2361static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2362 if (Arg) {
2363 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2364 Arg.getRegister());
2365 } else
2366 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2367}
2368
2369/// Allocate implicit function VGPR arguments at the end of allocated user
2370/// arguments.
2372 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2373 SIMachineFunctionInfo &Info) const {
2374 const unsigned Mask = 0x3ff;
2375 ArgDescriptor Arg;
2376
2377 if (Info.hasWorkItemIDX()) {
2378 Arg = allocateVGPR32Input(CCInfo, Mask);
2379 Info.setWorkItemIDX(Arg);
2380 }
2381
2382 if (Info.hasWorkItemIDY()) {
2383 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2384 Info.setWorkItemIDY(Arg);
2385 }
2386
2387 if (Info.hasWorkItemIDZ())
2388 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2389}
2390
2391/// Allocate implicit function VGPR arguments in fixed registers.
2393 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2394 SIMachineFunctionInfo &Info) const {
2395 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2396 if (!Reg)
2397 report_fatal_error("failed to allocated VGPR for implicit arguments");
2398
2399 const unsigned Mask = 0x3ff;
2400 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2401 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2402 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2403}
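// With this fixed layout all three workitem IDs share VGPR31: bits [9:0]
// hold X, bits [19:10] hold Y and bits [29:20] hold Z, so a consumer can
// recover, say, Y as (v31 >> 10) & 0x3ff.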
2404
2406 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2407 SIMachineFunctionInfo &Info) const {
2408 auto &ArgInfo = Info.getArgInfo();
2409 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2410
2411 // TODO: Unify handling with private memory pointers.
2412 if (UserSGPRInfo.hasDispatchPtr())
2413 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2414
2415 if (UserSGPRInfo.hasQueuePtr())
2416 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2417
2418 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2419 // constant offset from the kernarg segment.
2420 if (Info.hasImplicitArgPtr())
2421 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2422
2423 if (UserSGPRInfo.hasDispatchID())
2424 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2425
2426 // flat_scratch_init is not applicable for non-kernel functions.
2427
2428 if (Info.hasWorkGroupIDX())
2429 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2430
2431 if (Info.hasWorkGroupIDY())
2432 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2433
2434 if (Info.hasWorkGroupIDZ())
2435 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2436
2437 if (Info.hasLDSKernelId())
2438 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2439}
2440
2441// Allocate special inputs passed in user SGPRs.
2443 MachineFunction &MF,
2444 const SIRegisterInfo &TRI,
2445 SIMachineFunctionInfo &Info) const {
2446 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2447 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2448 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2449 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2450 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2451 }
2452
2453 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2454 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2455 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2456 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2457 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2458 }
2459
2460 if (UserSGPRInfo.hasDispatchPtr()) {
2461 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2462 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2463 CCInfo.AllocateReg(DispatchPtrReg);
2464 }
2465
2466 if (UserSGPRInfo.hasQueuePtr()) {
2467 Register QueuePtrReg = Info.addQueuePtr(TRI);
2468 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2469 CCInfo.AllocateReg(QueuePtrReg);
2470 }
2471
2472 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2474 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2475 CCInfo.AllocateReg(InputPtrReg);
2476
2477 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2478 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2479 }
2480
2481 if (UserSGPRInfo.hasDispatchID()) {
2482 Register DispatchIDReg = Info.addDispatchID(TRI);
2483 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2484 CCInfo.AllocateReg(DispatchIDReg);
2485 }
2486
2487 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2488 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2489 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2490 CCInfo.AllocateReg(FlatScratchInitReg);
2491 }
2492
2493 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2494 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2495 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2496 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2497 }
2498
2499 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2500 // these from the dispatch pointer.
2501}
2502
2503// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2504// sequential, starting from the first argument.
2506 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2508 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2509 Function &F = MF.getFunction();
2510 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2511 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2512 bool InPreloadSequence = true;
2513 unsigned InIdx = 0;
2514 bool AlignedForImplictArgs = false;
2515 unsigned ImplicitArgOffset = 0;
2516 for (auto &Arg : F.args()) {
2517 if (!InPreloadSequence || !Arg.hasInRegAttr())
2518 break;
2519
2520 unsigned ArgIdx = Arg.getArgNo();
2521 // Don't preload non-original args or parts not in the current preload
2522 // sequence.
2523 if (InIdx < Ins.size() &&
2524 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2525 break;
2526
2527 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2528 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2529 InIdx++) {
2530 assert(ArgLocs[ArgIdx].isMemLoc());
2531 auto &ArgLoc = ArgLocs[InIdx];
2532 const Align KernelArgBaseAlign = Align(16);
2533 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2534 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2535 unsigned NumAllocSGPRs =
2536 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2537
2538 // Fix alignment for hidden arguments.
2539 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2540 if (!AlignedForImplictArgs) {
2541 ImplicitArgOffset =
2542 alignTo(LastExplicitArgOffset,
2543 Subtarget->getAlignmentForImplicitArgPtr()) -
2544 LastExplicitArgOffset;
2545 AlignedForImplictArgs = true;
2546 }
2547 ArgOffset += ImplicitArgOffset;
2548 }
2549
2550 // Arg is preloaded into the previous SGPR.
2551 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2552 assert(InIdx >= 1 && "No previous SGPR");
2553 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2554 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2555 continue;
2556 }
2557
2558 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2559 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2560 // Check for free user SGPRs for preloading.
2561 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2562 InPreloadSequence = false;
2563 break;
2564 }
2565
2566 // Preload this argument.
2567 const TargetRegisterClass *RC =
2568 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2569 SmallVectorImpl<MCRegister> *PreloadRegs =
2570 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2571
2572 if (PreloadRegs->size() > 1)
2573 RC = &AMDGPU::SGPR_32RegClass;
2574 for (auto &Reg : *PreloadRegs) {
2575 assert(Reg);
2576 MF.addLiveIn(Reg, RC);
2577 CCInfo.AllocateReg(Reg);
2578 }
2579
2580 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2581 }
2582 }
2583}
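// Worked example of the SGPR accounting above (assuming an explicit kernarg
// offset of 0): for a kernel (i32 inreg %a, i64 inreg %b), %a takes one SGPR
// at offset 0 and leaves LastExplicitArgOffset = 4; %b lives at offset 8, so
// Padding = 4 bytes gives one padding SGPR plus two SGPRs for the value.
// Preloading %b therefore needs three free user SGPRs, otherwise the preload
// sequence stops at %a.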
2584
2586 const SIRegisterInfo &TRI,
2587 SIMachineFunctionInfo &Info) const {
2588 // Always allocate this last since it is a synthetic preload.
2589 if (Info.hasLDSKernelId()) {
2590 Register Reg = Info.addLDSKernelId();
2591 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2592 CCInfo.AllocateReg(Reg);
2593 }
2594}
2595
2596// Allocate special input registers that are initialized per-wave.
2599 CallingConv::ID CallConv,
2600 bool IsShader) const {
2601 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2602 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2603 // Note: user SGPRs are handled by the front-end for graphics shaders.
2604 // Pad up the used user SGPRs with dead inputs.
2605
2606 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2607 // before enabling architected SGPRs for workgroup IDs.
2608 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2609
2610 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2611 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2612 // rely on it to reach 16 since if we end up having no stack usage, it will
2613 // not really be added.
2614 unsigned NumRequiredSystemSGPRs =
2615 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2616 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2617 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2618 Register Reg = Info.addReservedUserSGPR();
2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622 }
2623
2624 if (!HasArchitectedSGPRs) {
2625 if (Info.hasWorkGroupIDX()) {
2626 Register Reg = Info.addWorkGroupIDX();
2627 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2628 CCInfo.AllocateReg(Reg);
2629 }
2630
2631 if (Info.hasWorkGroupIDY()) {
2632 Register Reg = Info.addWorkGroupIDY();
2633 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2634 CCInfo.AllocateReg(Reg);
2635 }
2636
2637 if (Info.hasWorkGroupIDZ()) {
2638 Register Reg = Info.addWorkGroupIDZ();
2639 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2640 CCInfo.AllocateReg(Reg);
2641 }
2642 }
2643
2644 if (Info.hasWorkGroupInfo()) {
2645 Register Reg = Info.addWorkGroupInfo();
2646 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2647 CCInfo.AllocateReg(Reg);
2648 }
2649
2650 if (Info.hasPrivateSegmentWaveByteOffset()) {
2651 // Scratch wave offset passed in system SGPR.
2652 unsigned PrivateSegmentWaveByteOffsetReg;
2653
2654 if (IsShader) {
2655 PrivateSegmentWaveByteOffsetReg =
2656 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2657
2658 // This is true if the scratch wave byte offset doesn't have a fixed
2659 // location.
2660 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2661 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2662 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2663 }
2664 } else
2665 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2666
2667 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2668 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2669 }
2670
2671 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2672 Info.getNumPreloadedSGPRs() >= 16);
2673}
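// Example of the init16 padding above: on a subtarget with the user-SGPR
// init bug, a kernel with 4 user SGPRs that only needs workgroup ID X (one
// system SGPR) starts the loop at i = 5 and adds 11 dead reserved user SGPRs
// so that the user plus required system SGPRs total 16.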
2674
2676 MachineFunction &MF,
2677 const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) {
2679 // Now that we've figured out where the scratch register inputs are, see if
2680 // we should reserve the arguments and use them directly.
2681 MachineFrameInfo &MFI = MF.getFrameInfo();
2682 bool HasStackObjects = MFI.hasStackObjects();
2683 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2684
2685 // Record that we know we have non-spill stack objects so we don't need to
2686 // check all stack objects later.
2687 if (HasStackObjects)
2688 Info.setHasNonSpillStackObjects(true);
2689
2690 // Everything live out of a block is spilled with fast regalloc, so it's
2691 // almost certain that spilling will be required.
2692 if (TM.getOptLevel() == CodeGenOptLevel::None)
2693 HasStackObjects = true;
2694
2695 // For now assume stack access is needed in any callee functions, so we need
2696 // the scratch registers to pass in.
2697 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2698
2699 if (!ST.enableFlatScratch()) {
2700 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2701 // If we have stack objects, we unquestionably need the private buffer
2702 // resource. For the Code Object V2 ABI, this will be the first 4 user
2703 // SGPR inputs. We can reserve those and use them directly.
2704
2705 Register PrivateSegmentBufferReg =
2707 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2708 } else {
2709 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2710 // We tentatively reserve the last registers (skipping those which may
2711 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2712 // we'll replace these with the ones immediately after those which were
2713 // really allocated. In the prologue copies will be inserted from the
2714 // argument to these reserved registers.
2715
2716 // Without HSA, relocations are used for the scratch pointer and the
2717 // buffer resource setup is always inserted in the prologue. Scratch wave
2718 // offset is still in an input SGPR.
2719 Info.setScratchRSrcReg(ReservedBufferReg);
2720 }
2721 }
2722
2724
2725 // For entry functions we have to set up the stack pointer if we use it,
2726 // whereas non-entry functions get this "for free". This means there is no
2727 // intrinsic advantage to using S32 over S34 in cases where we do not have
2728 // calls but do need a frame pointer (i.e. if we are requested to have one
2729 // because frame pointer elimination is disabled). To keep things simple we
2730 // only ever use S32 as the call ABI stack pointer, and so using it does not
2731 // imply we need a separate frame pointer.
2732 //
2733 // Try to use s32 as the SP, but move it if it would interfere with input
2734 // arguments. This won't work with calls though.
2735 //
2736 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2737 // registers.
2738 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2739 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2740 } else {
2742
2743 if (MFI.hasCalls())
2744 report_fatal_error("call in graphics shader with too many input SGPRs");
2745
2746 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2747 if (!MRI.isLiveIn(Reg)) {
2748 Info.setStackPtrOffsetReg(Reg);
2749 break;
2750 }
2751 }
2752
2753 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2754 report_fatal_error("failed to find register for SP");
2755 }
2756
2757 // hasFP should be accurate for entry functions even before the frame is
2758 // finalized, because it does not rely on the known stack size, only
2759 // properties like whether variable sized objects are present.
2760 if (ST.getFrameLowering()->hasFP(MF)) {
2761 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2762 }
2763}
2764
2767 return !Info->isEntryFunction();
2768}
2769
2771
2773 MachineBasicBlock *Entry,
2774 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2776
2777 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2778 if (!IStart)
2779 return;
2780
2781 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2782 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2783 MachineBasicBlock::iterator MBBI = Entry->begin();
2784 for (const MCPhysReg *I = IStart; *I; ++I) {
2785 const TargetRegisterClass *RC = nullptr;
2786 if (AMDGPU::SReg_64RegClass.contains(*I))
2787 RC = &AMDGPU::SGPR_64RegClass;
2788 else if (AMDGPU::SReg_32RegClass.contains(*I))
2789 RC = &AMDGPU::SGPR_32RegClass;
2790 else
2791 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2792
2793 Register NewVR = MRI->createVirtualRegister(RC);
2794 // Create copy from CSR to a virtual register.
2795 Entry->addLiveIn(*I);
2796 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2797 .addReg(*I);
2798
2799 // Insert the copy-back instructions right before the terminator.
2800 for (auto *Exit : Exits)
2801 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2802 TII->get(TargetOpcode::COPY), *I)
2803 .addReg(NewVR);
2804 }
2805}
2806
2808 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2809 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2810 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2812
2814 const Function &Fn = MF.getFunction();
2817
2818 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2819 DiagnosticInfoUnsupported NoGraphicsHSA(
2820 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2821 DAG.getContext()->diagnose(NoGraphicsHSA);
2822 return DAG.getEntryNode();
2823 }
2824
2827 BitVector Skipped(Ins.size());
2828 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2829 *DAG.getContext());
2830
2831 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2832 bool IsKernel = AMDGPU::isKernel(CallConv);
2833 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2834
2835 if (IsGraphics) {
2836 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2837 assert(!UserSGPRInfo.hasDispatchPtr() &&
2838 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2839 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2840 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2841 (void)UserSGPRInfo;
2842 if (!Subtarget->enableFlatScratch())
2843 assert(!UserSGPRInfo.hasFlatScratchInit());
2844 if ((CallConv != CallingConv::AMDGPU_CS &&
2845 CallConv != CallingConv::AMDGPU_Gfx) ||
2846 !Subtarget->hasArchitectedSGPRs())
2847 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2848 !Info->hasWorkGroupIDZ());
2849 }
2850
2851 if (CallConv == CallingConv::AMDGPU_PS) {
2852 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2853
2854 // At least one interpolation mode must be enabled or else the GPU will
2855 // hang.
2856 //
2857 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2858 // set PSInputAddr, the user wants to enable some bits after the compilation
2859 // based on run-time states. Since we can't know what the final PSInputEna
2860 // will look like, we shouldn't do anything here and the user should take
2861 // responsibility for the correct programming.
2862 //
2863 // Otherwise, the following restrictions apply:
2864 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2865 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2866 // enabled too.
2867 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2868 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2869 CCInfo.AllocateReg(AMDGPU::VGPR0);
2870 CCInfo.AllocateReg(AMDGPU::VGPR1);
2871 Info->markPSInputAllocated(0);
2872 Info->markPSInputEnabled(0);
2873 }
2874 if (Subtarget->isAmdPalOS()) {
2875 // For isAmdPalOS, the user does not enable some bits after compilation
2876 // based on run-time states; the register values being generated here are
2877 // the final ones set in hardware. Therefore we need to apply the
2878 // workaround to PSInputAddr and PSInputEnable together. (The case where
2879 // a bit is set in PSInputAddr but not PSInputEnable is where the
2880 // frontend set up an input arg for a particular interpolation mode, but
2881 // nothing uses that input arg. Really we should have an earlier pass
2882 // that removes such an arg.)
2883 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2884 if ((PsInputBits & 0x7F) == 0 ||
2885 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2886 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2887 }
2888 } else if (IsKernel) {
2889 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2890 } else {
2891 Splits.append(Ins.begin(), Ins.end());
2892 }
2893
2894 if (IsKernel)
2895 analyzeFormalArgumentsCompute(CCInfo, Ins);
2896
2897 if (IsEntryFunc) {
2898 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2899 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2900 if (IsKernel && Subtarget->hasKernargPreload())
2901 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2902
2903 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2904 } else if (!IsGraphics) {
2905 // For the fixed ABI, pass workitem IDs in the last argument register.
2906 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2907
2908 // FIXME: Sink this into allocateSpecialInputSGPRs
2909 if (!Subtarget->enableFlatScratch())
2910 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2911
2912 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2913 }
2914
2915 if (!IsKernel) {
2916 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2917 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2918 }
2919
2921
2922 // FIXME: This is the minimum kernel argument alignment. We should improve
2923 // this to the maximum alignment of the arguments.
2924 //
2925 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2926 // kern arg offset.
2927 const Align KernelArgBaseAlign = Align(16);
2928
2929 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2930 const ISD::InputArg &Arg = Ins[i];
2931 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2932 InVals.push_back(DAG.getUNDEF(Arg.VT));
2933 continue;
2934 }
2935
2936 CCValAssign &VA = ArgLocs[ArgIdx++];
2937 MVT VT = VA.getLocVT();
2938
2939 if (IsEntryFunc && VA.isMemLoc()) {
2940 VT = Ins[i].VT;
2941 EVT MemVT = VA.getLocVT();
2942
2943 const uint64_t Offset = VA.getLocMemOffset();
2944 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2945
2946 if (Arg.Flags.isByRef()) {
2947 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2948
2949 const GCNTargetMachine &TM =
2950 static_cast<const GCNTargetMachine &>(getTargetMachine());
2951 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2952 Arg.Flags.getPointerAddrSpace())) {
2955 }
2956
2957 InVals.push_back(Ptr);
2958 continue;
2959 }
2960
2961 SDValue NewArg;
2962 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2963 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2964 // In this case the argument is packed into the previous preload SGPR.
2965 int64_t AlignDownOffset = alignDown(Offset, 4);
2966 int64_t OffsetDiff = Offset - AlignDownOffset;
2967 EVT IntVT = MemVT.changeTypeToInteger();
2968
2972 Register Reg =
2973 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2974
2975 assert(Reg);
2976 Register VReg = MRI.getLiveInVirtReg(Reg);
2977 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2978
2979 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2980 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2981
2982 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2983 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2984 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2985 Ins[i].Flags.isSExt(), &Ins[i]);
2986
2987 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2988 } else {
2992 const SmallVectorImpl<MCRegister> &PreloadRegs =
2993 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2994
2995 SDValue Copy;
2996 if (PreloadRegs.size() == 1) {
2997 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2998 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2999 NewArg = DAG.getCopyFromReg(
3000 Chain, DL, VReg,
3002 TRI->getRegSizeInBits(*RC)));
3003
3004 } else {
3005 // If the kernarg alignment does not match the alignment of the SGPR
3006 // tuple RC that can accommodate this argument, it will be built up
3007 // via copies from the individual SGPRs that the argument was
3008 // preloaded to.
3010 for (auto Reg : PreloadRegs) {
3011 Register VReg = MRI.getLiveInVirtReg(Reg);
3012 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3013 Elts.push_back(Copy);
3014 }
3015 NewArg =
3016 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3017 PreloadRegs.size()),
3018 DL, Elts);
3019 }
3020
3021 // If the argument was preloaded to multiple consecutive 32-bit
3022 // registers because of misalignment between addressable SGPR tuples
3023 // and the argument size, we can still assume, because of kernarg
3024 // segment alignment restrictions, that NewArg's size is the same as
3025 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3026 // truncate since we cannot preload to less than a single SGPR and the
3027 // MemVT may be smaller.
3028 EVT MemVTInt =
3030 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3031 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3032
3033 NewArg = DAG.getBitcast(MemVT, NewArg);
3034 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3035 Ins[i].Flags.isSExt(), &Ins[i]);
3036 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3037 }
3038 } else {
3039 // Hidden arguments that are in the kernel signature must be preloaded
3040 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3041 // the argument list and is not preloaded.
3042 if (Arg.isOrigArg()) {
3043 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3044 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3045 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3046 *OrigArg->getParent(),
3047 "hidden argument in kernel signature was not preloaded",
3048 DL.getDebugLoc());
3049 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3050 }
3051 }
3052
3053 NewArg =
3054 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3055 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3056 }
3057 Chains.push_back(NewArg.getValue(1));
3058
3059 auto *ParamTy =
3060 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3062 ParamTy &&
3063 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3064 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3065 // On SI local pointers are just offsets into LDS, so they are always
3066 // less than 16-bits. On CI and newer they could potentially be
3067 // real pointers, so we can't guarantee their size.
3068 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3069 DAG.getValueType(MVT::i16));
3070 }
3071
3072 InVals.push_back(NewArg);
3073 continue;
3074 }
3075 if (!IsEntryFunc && VA.isMemLoc()) {
3076 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3077 InVals.push_back(Val);
3078 if (!Arg.Flags.isByVal())
3079 Chains.push_back(Val.getValue(1));
3080 continue;
3081 }
3082
3083 assert(VA.isRegLoc() && "Parameter must be in a register!");
3084
3085 Register Reg = VA.getLocReg();
3086 const TargetRegisterClass *RC = nullptr;
3087 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3088 RC = &AMDGPU::VGPR_32RegClass;
3089 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3090 RC = &AMDGPU::SGPR_32RegClass;
3091 else
3092 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3093 EVT ValVT = VA.getValVT();
3094
3095 Reg = MF.addLiveIn(Reg, RC);
3096 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3097
3098 if (Arg.Flags.isSRet()) {
3099 // The return object should be reasonably addressable.
3100
3101 // FIXME: This helps when the return is a real sret. If it is an
3102 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3103 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3104 unsigned NumBits =
3106 Val = DAG.getNode(
3107 ISD::AssertZext, DL, VT, Val,
3108 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3109 }
3110
3111 // If this is an 8 or 16-bit value, it is really passed promoted
3112 // to 32 bits. Insert an assert[sz]ext to capture this, then
3113 // truncate to the right size.
3114 switch (VA.getLocInfo()) {
3115 case CCValAssign::Full:
3116 break;
3117 case CCValAssign::BCvt:
3118 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3119 break;
3120 case CCValAssign::SExt:
3121 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3122 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3123 break;
3124 case CCValAssign::ZExt:
3125 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3126 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3127 break;
3128 case CCValAssign::AExt:
3129 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3130 break;
3131 default:
3132 llvm_unreachable("Unknown loc info!");
3133 }
3134
3135 InVals.push_back(Val);
3136 }
3137
3138 // Start adding system SGPRs.
3139 if (IsEntryFunc)
3140 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3141
3142 // DAG.getPass() returns nullptr when using new pass manager.
3143 // TODO: Use DAG.getMFAM() to access analysis result.
3144 if (DAG.getPass()) {
3145 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3146 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3147 }
3148
3149 unsigned StackArgSize = CCInfo.getStackSize();
3150 Info->setBytesInStackArgArea(StackArgSize);
3151
3152 return Chains.empty() ? Chain
3153 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3154}
3155
3156// TODO: If return values can't fit in registers, we should return as many as
3157// possible in registers before passing on stack.
3159 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3160 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3161 const Type *RetTy) const {
3162 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3163 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3164 // for shaders. Vector types should be explicitly handled by CC.
3165 if (AMDGPU::isEntryFunctionCC(CallConv))
3166 return true;
3167
3169 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3170 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3171 return false;
3172
3173 // We must use the stack if return would require unavailable registers.
3174 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3175 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3176 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3177 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3178 return false;
3179
3180 return true;
3181}
3182
3183SDValue
3185 bool isVarArg,
3187 const SmallVectorImpl<SDValue> &OutVals,
3188 const SDLoc &DL, SelectionDAG &DAG) const {
3191
3192 if (AMDGPU::isKernel(CallConv)) {
3193 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3194 OutVals, DL, DAG);
3195 }
3196
3197 bool IsShader = AMDGPU::isShader(CallConv);
3198
3199 Info->setIfReturnsVoid(Outs.empty());
3200 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3201
3202 // CCValAssign - represent the assignment of the return value to a location.
3205
3206 // CCState - Info about the registers and stack slots.
3207 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3208 *DAG.getContext());
3209
3210 // Analyze outgoing return values.
3211 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3212
3213 SDValue Glue;
3215 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3216
3217 // Copy the result values into the output registers.
3218 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3219 ++I, ++RealRVLocIdx) {
3220 CCValAssign &VA = RVLocs[I];
3221 assert(VA.isRegLoc() && "Can only return in registers!");
3222 // TODO: Partially return in registers if return values don't fit.
3223 SDValue Arg = OutVals[RealRVLocIdx];
3224
3225 // Copied from other backends.
3226 switch (VA.getLocInfo()) {
3227 case CCValAssign::Full:
3228 break;
3229 case CCValAssign::BCvt:
3230 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3231 break;
3232 case CCValAssign::SExt:
3233 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3234 break;
3235 case CCValAssign::ZExt:
3236 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3237 break;
3238 case CCValAssign::AExt:
3239 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3240 break;
3241 default:
3242 llvm_unreachable("Unknown loc info!");
3243 }
3244
3245 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3246 Glue = Chain.getValue(1);
3247 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3248 }
3249
3250 // FIXME: Does sret work properly?
3251 if (!Info->isEntryFunction()) {
3252 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3253 const MCPhysReg *I =
3254 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3255 if (I) {
3256 for (; *I; ++I) {
3257 if (AMDGPU::SReg_64RegClass.contains(*I))
3258 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3259 else if (AMDGPU::SReg_32RegClass.contains(*I))
3260 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3261 else
3262 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3263 }
3264 }
3265 }
3266
3267 // Update chain and glue.
3268 RetOps[0] = Chain;
3269 if (Glue.getNode())
3270 RetOps.push_back(Glue);
3271
3272 unsigned Opc = AMDGPUISD::ENDPGM;
3273 if (!IsWaveEnd)
3275 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3276}
3277
3279 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3280 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3281 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3282 SDValue ThisVal) const {
3283 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3284
3285 // Assign locations to each value returned by this call.
3287 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3288 *DAG.getContext());
3289 CCInfo.AnalyzeCallResult(Ins, RetCC);
3290
3291 // Copy all of the result registers out of their specified physreg.
3292 for (CCValAssign VA : RVLocs) {
3293 SDValue Val;
3294
3295 if (VA.isRegLoc()) {
3296 Val =
3297 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3298 Chain = Val.getValue(1);
3299 InGlue = Val.getValue(2);
3300 } else if (VA.isMemLoc()) {
3301 report_fatal_error("TODO: return values in memory");
3302 } else
3303 llvm_unreachable("unknown argument location type");
3304
3305 switch (VA.getLocInfo()) {
3306 case CCValAssign::Full:
3307 break;
3308 case CCValAssign::BCvt:
3309 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3310 break;
3311 case CCValAssign::ZExt:
3312 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3313 DAG.getValueType(VA.getValVT()));
3314 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3315 break;
3316 case CCValAssign::SExt:
3317 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3318 DAG.getValueType(VA.getValVT()));
3319 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3320 break;
3321 case CCValAssign::AExt:
3322 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3323 break;
3324 default:
3325 llvm_unreachable("Unknown loc info!");
3326 }
3327
3328 InVals.push_back(Val);
3329 }
3330
3331 return Chain;
3332}
3333
3334// Add code to pass the special inputs required by the features in use,
3335// separate from the explicit user arguments present in the IR.
3337 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3338 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3339 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3340 // If we don't have a call site, this was a call inserted by
3341 // legalization. These can never use special inputs.
3342 if (!CLI.CB)
3343 return;
3344
3345 SelectionDAG &DAG = CLI.DAG;
3346 const SDLoc &DL = CLI.DL;
3347 const Function &F = DAG.getMachineFunction().getFunction();
3348
3349 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3350 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3351
3352 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3354 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3355 // DAG.getPass() returns nullptr when using new pass manager.
3356 // TODO: Use DAG.getMFAM() to access analysis result.
3357 if (DAG.getPass()) {
3358 auto &ArgUsageInfo =
3360 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3361 }
3362 }
3363
3364 // TODO: Unify with private memory register handling. This is complicated by
3365 // the fact that at least in kernels, the input argument is not necessarily
3366 // in the same location as the input.
3367 // clang-format off
3368 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3370 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3371 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3372 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3373 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3374 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3375 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3376 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3377 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3378 };
3379 // clang-format on
3380
3381 for (auto [InputID, Attr] : ImplicitAttrs) {
3382 // If the callee does not use the attribute value, skip copying the value.
3383 if (CLI.CB->hasFnAttr(Attr))
3384 continue;
3385
3386 const auto [OutgoingArg, ArgRC, ArgTy] =
3387 CalleeArgInfo->getPreloadedValue(InputID);
3388 if (!OutgoingArg)
3389 continue;
3390
3391 const auto [IncomingArg, IncomingArgRC, Ty] =
3392 CallerArgInfo.getPreloadedValue(InputID);
3393 assert(IncomingArgRC == ArgRC);
3394
3395 // All special arguments are ints for now.
3396 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3397 SDValue InputReg;
3398
3399 if (IncomingArg) {
3400 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3401 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3402 // The implicit arg ptr is special because it doesn't have a corresponding
3403 // input for kernels, and is computed from the kernarg segment pointer.
3404 InputReg = getImplicitArgPtr(DAG, DL);
3405 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3406 std::optional<uint32_t> Id =
3408 if (Id.has_value()) {
3409 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3410 } else {
3411 InputReg = DAG.getUNDEF(ArgVT);
3412 }
3413 } else {
3414 // We may have proven the input wasn't needed, although the ABI still
3415 // requires it. We just need to allocate the register appropriately.
3416 InputReg = DAG.getUNDEF(ArgVT);
3417 }
3418
3419 if (OutgoingArg->isRegister()) {
3420 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3421 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3422 report_fatal_error("failed to allocate implicit input argument");
3423 } else {
3424 unsigned SpecialArgOffset =
3425 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3426 SDValue ArgStore =
3427 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3428 MemOpChains.push_back(ArgStore);
3429 }
3430 }
3431
3432 // Pack the workitem IDs into a single register, or pass them as-is if
3433 // they are already packed.
3434
3435 auto [OutgoingArg, ArgRC, Ty] =
3437 if (!OutgoingArg)
3438 std::tie(OutgoingArg, ArgRC, Ty) =
3440 if (!OutgoingArg)
3441 std::tie(OutgoingArg, ArgRC, Ty) =
3443 if (!OutgoingArg)
3444 return;
3445
3446 const ArgDescriptor *IncomingArgX = std::get<0>(
3448 const ArgDescriptor *IncomingArgY = std::get<0>(
3450 const ArgDescriptor *IncomingArgZ = std::get<0>(
3452
3453 SDValue InputReg;
3454 SDLoc SL;
3455
3456 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3457 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3458 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3459
3460 // If incoming ids are not packed we need to pack them.
3461 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3462 NeedWorkItemIDX) {
3463 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3464 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3465 } else {
3466 InputReg = DAG.getConstant(0, DL, MVT::i32);
3467 }
3468 }
3469
3470 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3471 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3472 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3473 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3474 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3475 InputReg = InputReg.getNode()
3476 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3477 : Y;
3478 }
3479
3480 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3481 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3482 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3483 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3484 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3485 InputReg = InputReg.getNode()
3486 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3487 : Z;
3488 }
3489
3490 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3491 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3492 // We're in a situation where the outgoing function requires the workitem
3493 // ID, but the calling function does not have it (e.g. a graphics function
3494 // calling a C calling convention function). This is illegal, but we need
3495 // to produce something.
3496 InputReg = DAG.getUNDEF(MVT::i32);
3497 } else {
3498 // The workitem IDs are already packed, so any of the present incoming
3499 // arguments will carry all required fields.
3500 ArgDescriptor IncomingArg =
3501 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3502 : IncomingArgY ? *IncomingArgY
3503 : *IncomingArgZ,
3504 ~0u);
3505 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3506 }
3507 }
3508
3509 if (OutgoingArg->isRegister()) {
3510 if (InputReg)
3511 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3512
3513 CCInfo.AllocateReg(OutgoingArg->getRegister());
3514 } else {
3515 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3516 if (InputReg) {
3517 SDValue ArgStore =
3518 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3519 MemOpChains.push_back(ArgStore);
3520 }
3521 }
3522}
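// Example of the packing done above: if the callee is only marked
// amdgpu-no-workitem-id-z, the caller materializes X | (Y << 10) (loading
// each piece from its own descriptor when the incoming IDs are not already
// packed) and passes that single 32-bit value in the callee's workitem ID
// register; with all three IDs needed the value becomes
// X | (Y << 10) | (Z << 20).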
3523
3525 return CC == CallingConv::Fast;
3526}
3527
3528/// Return true if we might ever do TCO for calls with this calling convention.
3530 switch (CC) {
3531 case CallingConv::C:
3533 return true;
3534 default:
3535 return canGuaranteeTCO(CC);
3536 }
3537}
3538
3540 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3542 const SmallVectorImpl<SDValue> &OutVals,
3543 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3544 if (AMDGPU::isChainCC(CalleeCC))
3545 return true;
3546
3547 if (!mayTailCallThisCC(CalleeCC))
3548 return false;
3549
3550 // For a divergent call target, we need to do a waterfall loop over the
3551 // possible callees which precludes us from using a simple jump.
3552 if (Callee->isDivergent())
3553 return false;
3554
3556 const Function &CallerF = MF.getFunction();
3557 CallingConv::ID CallerCC = CallerF.getCallingConv();
3559 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3560
3561 // Kernels aren't callable, and don't have a live-in return address, so it
3562 // doesn't make sense to do a tail call with entry functions.
3563 if (!CallerPreserved)
3564 return false;
3565
3566 bool CCMatch = CallerCC == CalleeCC;
3567
3569 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3570 return true;
3571 return false;
3572 }
3573
3574 // TODO: Can we handle var args?
3575 if (IsVarArg)
3576 return false;
3577
3578 for (const Argument &Arg : CallerF.args()) {
3579 if (Arg.hasByValAttr())
3580 return false;
3581 }
3582
3583 LLVMContext &Ctx = *DAG.getContext();
3584
3585 // Check that the call results are passed in the same way.
3586 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3587 CCAssignFnForCall(CalleeCC, IsVarArg),
3588 CCAssignFnForCall(CallerCC, IsVarArg)))
3589 return false;
3590
3591 // The callee has to preserve all registers the caller needs to preserve.
3592 if (!CCMatch) {
3593 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3594 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3595 return false;
3596 }
3597
3598 // Nothing more to check if the callee is taking no arguments.
3599 if (Outs.empty())
3600 return true;
3601
3603 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3604
3605 // FIXME: We are not allocating special input registers, so we will be
3606 // deciding based on incorrect register assignments.
3607 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3608
3609 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3610 // If the stack arguments for this call do not fit into our own save area then
3611 // the call cannot be made tail.
3612 // TODO: Is this really necessary?
3613 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3614 return false;
3615
3616 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3617 // FIXME: What about inreg arguments that end up passed in memory?
3618 if (!CCVA.isRegLoc())
3619 continue;
3620
3621 // If we are passing an argument in an SGPR, and the value is divergent,
3622 // this call requires a waterfall loop.
3623 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3624 LLVM_DEBUG(
3625 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3626 << printReg(CCVA.getLocReg(), TRI) << '\n');
3627 return false;
3628 }
3629 }
3630
3631 const MachineRegisterInfo &MRI = MF.getRegInfo();
3632 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3633}
3634
3635 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3636   if (!CI->isTailCall())
3637 return false;
3638
3639 const Function *ParentFn = CI->getParent()->getParent();
3640   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3641     return false;
3642 return true;
3643}
3644
3645// The wave scratch offset register is used as the global base pointer.
3646 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3647                                     SmallVectorImpl<SDValue> &InVals) const {
3648 CallingConv::ID CallConv = CLI.CallConv;
3649 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3650
3651 SelectionDAG &DAG = CLI.DAG;
3652
3653 TargetLowering::ArgListEntry RequestedExec;
3654 if (IsChainCallConv) {
3655 // The last argument should be the value that we need to put in EXEC.
3656 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3657 // don't treat it like the rest of the arguments.
3658 RequestedExec = CLI.Args.back();
3659 assert(RequestedExec.Node && "No node for EXEC");
3660
3661 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3662 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3663
3664 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3665 CLI.Outs.pop_back();
3666 CLI.OutVals.pop_back();
3667
3668 if (RequestedExec.Ty->isIntegerTy(64)) {
3669 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3670 CLI.Outs.pop_back();
3671 CLI.OutVals.pop_back();
3672 }
3673
3674 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3675 "Haven't popped all the pieces of the EXEC mask");
3676 }
3677
3678 const SDLoc &DL = CLI.DL;
3680 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3682 SDValue Chain = CLI.Chain;
3683 SDValue Callee = CLI.Callee;
3684 bool &IsTailCall = CLI.IsTailCall;
3685 bool IsVarArg = CLI.IsVarArg;
3686 bool IsSibCall = false;
3688
3689 if (Callee.isUndef() || isNullConstant(Callee)) {
3690 if (!CLI.IsTailCall) {
3691 for (ISD::InputArg &Arg : CLI.Ins)
3692 InVals.push_back(DAG.getUNDEF(Arg.VT));
3693 }
3694
3695 return Chain;
3696 }
3697
3698 if (IsVarArg) {
3699 return lowerUnhandledCall(CLI, InVals,
3700 "unsupported call to variadic function ");
3701 }
3702
3703 if (!CLI.CB)
3704 report_fatal_error("unsupported libcall legalization");
3705
3706 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3707 return lowerUnhandledCall(CLI, InVals,
3708 "unsupported required tail call to function ");
3709 }
3710
3711 if (IsTailCall) {
3712 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3713 Outs, OutVals, Ins, DAG);
3714 if (!IsTailCall &&
3715 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3716 report_fatal_error("failed to perform tail call elimination on a call "
3717 "site marked musttail or on llvm.amdgcn.cs.chain");
3718 }
3719
3720 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3721
3722 // A sibling call is one where we're under the usual C ABI and not planning
3723 // to change that but can still do a tail call:
3724 if (!TailCallOpt && IsTailCall)
3725 IsSibCall = true;
3726
3727 if (IsTailCall)
3728 ++NumTailCalls;
3729 }
3730
3733 SmallVector<SDValue, 8> MemOpChains;
3734
3735 // Analyze operands of the call, assigning locations to each operand.
3737 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3738 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3739
3740 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3741 // With a fixed ABI, allocate fixed registers before user arguments.
3742 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3743 }
3744
3745 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3746
3747 // Get a count of how many bytes are to be pushed on the stack.
3748 unsigned NumBytes = CCInfo.getStackSize();
3749
3750 if (IsSibCall) {
3751 // Since we're not changing the ABI to make this a tail call, the memory
3752 // operands are already available in the caller's incoming argument space.
3753 NumBytes = 0;
3754 }
3755
3756 // FPDiff is the byte offset of the call's argument area from the callee's.
3757 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3758 // by this amount for a tail call. In a sibling call it must be 0 because the
3759 // caller will deallocate the entire stack and the callee still expects its
3760 // arguments to begin at SP+0. Completely unused for non-tail calls.
3761 int32_t FPDiff = 0;
3762 MachineFrameInfo &MFI = MF.getFrameInfo();
3763 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3764
3765 // Adjust the stack pointer for the new arguments...
3766 // These operations are automatically eliminated by the prolog/epilog pass
3767 if (!IsSibCall)
3768 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3769
3770 if (!IsSibCall || IsChainCallConv) {
3771 if (!Subtarget->enableFlatScratch()) {
3772 SmallVector<SDValue, 4> CopyFromChains;
3773
3774 // In the HSA case, this should be an identity copy.
3775 SDValue ScratchRSrcReg =
3776 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3777 RegsToPass.emplace_back(IsChainCallConv
3778 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3779 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3780 ScratchRSrcReg);
3781 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3782 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3783 }
3784 }
3785
3786 const unsigned NumSpecialInputs = RegsToPass.size();
3787
3788 MVT PtrVT = MVT::i32;
3789
3790 // Walk the register/memloc assignments, inserting copies/loads.
3791 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3792 CCValAssign &VA = ArgLocs[i];
3793 SDValue Arg = OutVals[i];
3794
3795 // Promote the value if needed.
3796 switch (VA.getLocInfo()) {
3797 case CCValAssign::Full:
3798 break;
3799 case CCValAssign::BCvt:
3800 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3801 break;
3802 case CCValAssign::ZExt:
3803 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3804 break;
3805 case CCValAssign::SExt:
3806 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3807 break;
3808 case CCValAssign::AExt:
3809 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3810 break;
3811 case CCValAssign::FPExt:
3812 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3813 break;
3814 default:
3815 llvm_unreachable("Unknown loc info!");
3816 }
3817
3818 if (VA.isRegLoc()) {
3819 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3820 } else {
3821 assert(VA.isMemLoc());
3822
3823 SDValue DstAddr;
3824 MachinePointerInfo DstInfo;
3825
3826 unsigned LocMemOffset = VA.getLocMemOffset();
3827 int32_t Offset = LocMemOffset;
3828
3829 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3830 MaybeAlign Alignment;
3831
3832 if (IsTailCall) {
3833 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3834 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3835 : VA.getValVT().getStoreSize();
3836
3837 // FIXME: We can have better than the minimum byval required alignment.
3838 Alignment =
3839 Flags.isByVal()
3840 ? Flags.getNonZeroByValAlign()
3841 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3842
3843 Offset = Offset + FPDiff;
3844 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3845
3846 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3847 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3848
3849 // Make sure any stack arguments overlapping with where we're storing
3850 // are loaded before this eventual operation. Otherwise they'll be
3851 // clobbered.
3852
3853 // FIXME: Why is this really necessary? This seems to just result in a
3854 // lot of code to copy the stack and write them back to the same
3855 // locations, which are supposed to be immutable?
3856 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3857 } else {
3858 // Stores to the argument stack area are relative to the stack pointer.
3859 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3860 MVT::i32);
3861 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3862 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3863 Alignment =
3864 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3865 }
3866
3867 if (Outs[i].Flags.isByVal()) {
3868 SDValue SizeNode =
3869 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3870 SDValue Cpy =
3871 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3872 Outs[i].Flags.getNonZeroByValAlign(),
3873 /*isVol = */ false, /*AlwaysInline = */ true,
3874 /*CI=*/nullptr, std::nullopt, DstInfo,
3875                              MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3876 
3877 MemOpChains.push_back(Cpy);
3878 } else {
3879 SDValue Store =
3880 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3881 MemOpChains.push_back(Store);
3882 }
3883 }
3884 }
3885
3886 if (!MemOpChains.empty())
3887 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3888
3889 SDValue ReadFirstLaneID =
3890 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3891
3892 SDValue TokenGlue;
3893 if (CLI.ConvergenceControlToken) {
3894 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3895                             CLI.ConvergenceControlToken);
3896   }
3897
3898 // Build a sequence of copy-to-reg nodes chained together with token chain
3899 // and flag operands which copy the outgoing args into the appropriate regs.
3900 SDValue InGlue;
3901
3902 unsigned ArgIdx = 0;
3903 for (auto [Reg, Val] : RegsToPass) {
3904 if (ArgIdx++ >= NumSpecialInputs &&
3905 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3906 // For chain calls, the inreg arguments are required to be
3907       // uniform. Speculatively insert a readfirstlane in case we cannot prove
3908 // they are uniform.
3909 //
3910       // For other calls, if an inreg argument is known to be uniform,
3911 // speculatively insert a readfirstlane in case it is in a VGPR.
3912 //
3913 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3914 // value, so let that continue to produce invalid code.
3915
3916 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3917 if (TokenGlue)
3918 ReadfirstlaneArgs.push_back(TokenGlue);
3919       Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3920                         ReadfirstlaneArgs);
3921 }
3922
3923 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3924 InGlue = Chain.getValue(1);
3925 }
3926
3927 // We don't usually want to end the call-sequence here because we would tidy
3928 // the frame up *after* the call, however in the ABI-changing tail-call case
3929 // we've carefully laid out the parameters so that when sp is reset they'll be
3930 // in the correct location.
3931 if (IsTailCall && !IsSibCall) {
3932 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3933 InGlue = Chain.getValue(1);
3934 }
3935
3936 std::vector<SDValue> Ops({Chain});
3937
3938 // Add a redundant copy of the callee global which will not be legalized, as
3939 // we need direct access to the callee later.
3940 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3941 const GlobalValue *GV = GSD->getGlobal();
3942 Ops.push_back(Callee);
3943 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3944 } else {
3945 if (IsTailCall) {
3946 // isEligibleForTailCallOptimization considered whether the call target is
3947 // divergent, but we may still end up with a uniform value in a VGPR.
3948 // Insert a readfirstlane just in case.
3949 SDValue ReadFirstLaneID =
3950 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3951
3952 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3953 if (TokenGlue)
3954 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3955 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3956 ReadfirstlaneArgs);
3957 }
3958
3959 Ops.push_back(Callee);
3960 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3961 }
3962
3963 if (IsTailCall) {
3964 // Each tail call may have to adjust the stack by a different amount, so
3965 // this information must travel along with the operation for eventual
3966 // consumption by emitEpilogue.
3967 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3968 }
3969
3970 if (IsChainCallConv)
3971 Ops.push_back(RequestedExec.Node);
3972
3973 // Add argument registers to the end of the list so that they are known live
3974 // into the call.
3975 for (auto &[Reg, Val] : RegsToPass)
3976 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3977
3978 // Add a register mask operand representing the call-preserved registers.
3979 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3980 assert(Mask && "Missing call preserved mask for calling convention");
3981 Ops.push_back(DAG.getRegisterMask(Mask));
3982
3983 if (SDValue Token = CLI.ConvergenceControlToken) {
3984     SmallVector<SDValue, 2> GlueOps;
3985     GlueOps.push_back(Token);
3986 if (InGlue)
3987 GlueOps.push_back(InGlue);
3988
3989 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3990 MVT::Glue, GlueOps),
3991 0);
3992 }
3993
3994 if (InGlue)
3995 Ops.push_back(InGlue);
3996
3997   // If we're doing a tail call, use a TC_RETURN here rather than an
3998 // actual call instruction.
3999 if (IsTailCall) {
4000 MFI.setHasTailCall();
4001 unsigned OPC = AMDGPUISD::TC_RETURN;
4002     switch (CallConv) {
4003     case CallingConv::AMDGPU_Gfx:
4004       OPC = AMDGPUISD::TC_RETURN_GFX;
4005       break;
4006     case CallingConv::AMDGPU_CS_Chain:
4007     case CallingConv::AMDGPU_CS_ChainPreserve:
4008       OPC = AMDGPUISD::TC_RETURN_CHAIN;
4009       break;
4010     }
4011
4012 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4013 }
4014
4015 // Returns a chain and a flag for retval copy to use.
4016 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4017 Chain = Call.getValue(0);
4018 InGlue = Call.getValue(1);
4019
4020 uint64_t CalleePopBytes = NumBytes;
4021 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4022 if (!Ins.empty())
4023 InGlue = Chain.getValue(1);
4024
4025 // Handle result values, copying them out of physregs into vregs that we
4026 // return.
4027 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4028 InVals, /*IsThisReturn=*/false, SDValue());
4029}
4030
4031// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4032// except for:
4033 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4034 // 2. Scaled size, where scale = wave-reduction(alloca-size) * wave-size (see the illustrative sketch after this function).
4036 SelectionDAG &DAG) const {
4037 const MachineFunction &MF = DAG.getMachineFunction();
4039
4040 SDLoc dl(Op);
4041 EVT VT = Op.getValueType();
4042 SDValue Chain = Op.getOperand(0);
4043 Register SPReg = Info->getStackPtrOffsetReg();
4044
4045 // Chain the dynamic stack allocation so that it doesn't modify the stack
4046 // pointer when other instructions are using the stack.
4047 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4048
4049 SDValue Size = Op.getOperand(1);
4050 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4051 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4052
4053 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4054   assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4055          "Stack grows upwards for AMDGPU");
4056
4057 Chain = BaseAddr.getValue(1);
4058 Align StackAlign = TFL->getStackAlign();
4059 if (Alignment > StackAlign) {
4060 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4061 << Subtarget->getWavefrontSizeLog2();
4062 uint64_t StackAlignMask = ScaledAlignment - 1;
4063 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4064 DAG.getConstant(StackAlignMask, dl, VT));
4065 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4066 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4067 }
4068
4069 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4070 SDValue NewSP;
4071 if (isa<ConstantSDNode>(Size)) {
4072 // For constant sized alloca, scale alloca size by wave-size
4073 SDValue ScaledSize = DAG.getNode(
4074 ISD::SHL, dl, VT, Size,
4075 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4076 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4077 } else {
4078     // For a dynamically sized alloca, perform a wave-wide reduction to get the
4079     // maximum alloca size (divergent), then scale it by the wave size.
4080 SDValue WaveReduction =
4081 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4082 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4083 Size, DAG.getConstant(0, dl, MVT::i32));
4084 SDValue ScaledSize = DAG.getNode(
4085 ISD::SHL, dl, VT, Size,
4086 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4087 NewSP =
4088 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4089 SDValue ReadFirstLaneID =
4090 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4091 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4092 NewSP);
4093 }
4094
4095 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4096 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4097
4098 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4099}
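// Editor's note: a minimal standalone sketch (not part of this file) of the
// pointer arithmetic performed by the lowering above, for a wave-uniform
// allocation size. All names here are illustrative assumptions, and the real
// lowering skips the over-alignment step when the requested alignment does
// not exceed the default stack alignment.
static uint32_t modelDynamicAlloca(uint32_t SP, uint32_t SizeBytes,
                                   uint32_t Alignment, unsigned WaveSizeLog2,
                                   uint32_t &NewSP) {
  // Scratch is wave-swizzled, so the alignment is scaled by the wave size
  // before rounding the incoming SP up to it.
  uint32_t ScaledAlign = Alignment << WaveSizeLog2;
  uint32_t BaseAddr = (SP + ScaledAlign - 1) & -ScaledAlign;
  // The AMDGPU stack grows upwards: bump SP by the wave-scaled size.
  NewSP = BaseAddr + (SizeBytes << WaveSizeLog2);
  return BaseAddr; // The value produced by the DYNAMIC_STACKALLOC node.
}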
4100
4102 if (Op.getValueType() != MVT::i32)
4103 return Op; // Defer to cannot select error.
4104
4106 SDLoc SL(Op);
4107
4108 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4109
4110 // Convert from wave uniform to swizzled vector address. This should protect
4111 // from any edge cases where the stacksave result isn't directly used with
4112 // stackrestore.
4113 SDValue VectorAddress =
4114 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4115 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4116}
4117
4119 SelectionDAG &DAG) const {
4120 SDLoc SL(Op);
4121 assert(Op.getValueType() == MVT::i32);
4122
4123 uint32_t BothRoundHwReg =
4124       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4125   SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4126
4127 SDValue IntrinID =
4128 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4129 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4130 Op.getOperand(0), IntrinID, GetRoundBothImm);
4131
4132 // There are two rounding modes, one for f32 and one for f64/f16. We only
4133 // report in the standard value range if both are the same.
4134 //
4135 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4136 // ties away from zero is not supported, and the other values are rotated by
4137 // 1.
4138 //
4139 // If the two rounding modes are not the same, report a target defined value.
4140
4141 // Mode register rounding mode fields:
4142 //
4143 // [1:0] Single-precision round mode.
4144 // [3:2] Double/Half-precision round mode.
4145 //
4146   // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4147 //
4148 // Hardware Spec
4149 // Toward-0 3 0
4150 // Nearest Even 0 1
4151 // +Inf 1 2
4152 // -Inf 2 3
4153 // NearestAway0 N/A 4
4154 //
4155 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4156 // table we can index by the raw hardware mode.
4157 //
4158 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4159
4160 SDValue BitTable =
4161       DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4162 
4163 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4164 SDValue RoundModeTimesNumBits =
4165 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4166
4167 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4168 // knew only one mode was demanded.
4169 SDValue TableValue =
4170 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4171 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4172
4173 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4174 SDValue TableEntry =
4175 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4176
4177 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4178 // if it's an extended value.
4179 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4180 SDValue IsStandardValue =
4181 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4182 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4183 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4184 TableEntry, EnumOffset);
4185
4186 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4187}
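// Editor's note: a standalone scalar model (not part of this file) of the
// table decode built above; `Table` stands in for the 64-bit
// FltRoundConversionTable and the other names are illustrative assumptions.
static unsigned modelGetRounding(uint64_t Table, unsigned HwFpRoundField) {
  // Each of the 16 raw MODE.fp_round values selects a 4-bit table entry.
  unsigned Entry = (Table >> (HwFpRoundField * 4)) & 0xf;
  // Entries below 4 are standard FLT_ROUNDS values; larger entries are the
  // target-defined extended values, which sit past a gap of 4 in the enum.
  return Entry < 4 ? Entry : Entry + 4;
}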
4188
4190 SelectionDAG &DAG) const {
4191 SDLoc SL(Op);
4192
4193 SDValue NewMode = Op.getOperand(1);
4194 assert(NewMode.getValueType() == MVT::i32);
4195
4196 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4197 // hardware MODE.fp_round values.
4198 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4199 uint32_t ClampedVal = std::min(
4200 static_cast<uint32_t>(ConstMode->getZExtValue()),
4202 NewMode = DAG.getConstant(
4203 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4204 } else {
4205 // If we know the input can only be one of the supported standard modes in
4206 // the range 0-3, we can use a simplified mapping to hardware values.
4207 KnownBits KB = DAG.computeKnownBits(NewMode);
4208 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4209 // The supported standard values are 0-3. The extended values start at 8. We
4210 // need to offset by 4 if the value is in the extended range.
4211
4212 if (UseReducedTable) {
4213 // Truncate to the low 32-bits.
4214 SDValue BitTable = DAG.getConstant(
4215 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4216
4217 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4218 SDValue RoundModeTimesNumBits =
4219 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4220
4221 NewMode =
4222 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4223
4224 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4225 // the table extracted bits into inline immediates.
4226 } else {
4227 // table_index = umin(value, value - 4)
4228 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4229 SDValue BitTable =
4230           DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4231 
4232 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4233 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4234 SDValue IndexVal =
4235 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4236
4237 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4238 SDValue RoundModeTimesNumBits =
4239 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4240
4241 SDValue TableValue =
4242 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4243 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4244
4245 // No need to mask out the high bits since the setreg will ignore them
4246 // anyway.
4247 NewMode = TruncTable;
4248 }
4249
4250 // Insert a readfirstlane in case the value is a VGPR. We could do this
4251 // earlier and keep more operations scalar, but that interferes with
4252 // combining the source.
4253 SDValue ReadFirstLaneID =
4254 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4255 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4256 ReadFirstLaneID, NewMode);
4257 }
4258
4259 // N.B. The setreg will be later folded into s_round_mode on supported
4260 // targets.
4261 SDValue IntrinID =
4262 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4263 uint32_t BothRoundHwReg =
4264       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4265   SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4266
4267 SDValue SetReg =
4268 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4269 IntrinID, RoundBothImm, NewMode);
4270
4271 return SetReg;
4272}
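// Editor's note: a standalone scalar model (not part of this file) of the
// non-constant path above; `Table` stands in for the 64-bit
// FltRoundToHWConversionTable, the other names are illustrative, and
// <algorithm> is assumed for std::min.
static unsigned modelSetRounding(uint64_t Table, uint32_t FltRoundsVal) {
  // table_index = umin(value, value - 4): for the standard values 0-3 the
  // subtraction wraps around, so umin keeps the value itself; for the
  // extended values it removes the gap of 4 in the enum.
  uint32_t Index = std::min(FltRoundsVal, FltRoundsVal - 4);
  // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
  return static_cast<unsigned>((Table >> (Index * 4)) & 0xf);
}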
4273
4275 if (Op->isDivergent())
4276 return SDValue();
4277
4278 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4283 break;
4284 default:
4285 return SDValue();
4286 }
4287
4288 return Op;
4289}
4290
4291// Work around DAG legality rules only based on the result type.
4293 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4294 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4295 EVT SrcVT = Src.getValueType();
4296
4297 if (SrcVT.getScalarType() != MVT::bf16)
4298 return Op;
4299
4300 SDLoc SL(Op);
4301 SDValue BitCast =
4302 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4303
4304 EVT DstVT = Op.getValueType();
4305 if (IsStrict)
4306 llvm_unreachable("Need STRICT_BF16_TO_FP");
4307
4308 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4309}
4310
4312 SDLoc SL(Op);
4313 if (Op.getValueType() != MVT::i64)
4314 return Op;
4315
4316 uint32_t ModeHwReg =
4318 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4319 uint32_t TrapHwReg =
4321 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4322
4323 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4324 SDValue IntrinID =
4325 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4326 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4327 Op.getOperand(0), IntrinID, ModeHwRegImm);
4328 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4329 Op.getOperand(0), IntrinID, TrapHwRegImm);
4330 SDValue TokenReg =
4331 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4332 GetTrapReg.getValue(1));
4333
4334 SDValue CvtPtr =
4335 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4336 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4337
4338 return DAG.getMergeValues({Result, TokenReg}, SL);
4339}
4340
4342 SDLoc SL(Op);
4343 if (Op.getOperand(1).getValueType() != MVT::i64)
4344 return Op;
4345
4346 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4347 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4348 DAG.getConstant(0, SL, MVT::i32));
4349 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4350 DAG.getConstant(1, SL, MVT::i32));
4351
4352 SDValue ReadFirstLaneID =
4353 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4354 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4355 ReadFirstLaneID, NewModeReg);
4356 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4357 ReadFirstLaneID, NewTrapReg);
4358
4359 unsigned ModeHwReg =
4361 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4362 unsigned TrapHwReg =
4364 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4365
4366 SDValue IntrinID =
4367 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4368 SDValue SetModeReg =
4369 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4370 IntrinID, ModeHwRegImm, NewModeReg);
4371 SDValue SetTrapReg =
4372 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4373 IntrinID, TrapHwRegImm, NewTrapReg);
4374 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4375}
4376
4377 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4378                                              const MachineFunction &MF) const {
4379   Register Reg = StringSwitch<Register>(RegName)
4380                      .Case("m0", AMDGPU::M0)
4381 .Case("exec", AMDGPU::EXEC)
4382 .Case("exec_lo", AMDGPU::EXEC_LO)
4383 .Case("exec_hi", AMDGPU::EXEC_HI)
4384 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4385 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4386 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4387 .Default(Register());
4388
4389 if (Reg == AMDGPU::NoRegister) {
4390     report_fatal_error(
4391         Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4392 }
4393
4394 if (!Subtarget->hasFlatScrRegister() &&
4395 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4396 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4397 "\" for subtarget."));
4398 }
4399
4400 switch (Reg) {
4401 case AMDGPU::M0:
4402 case AMDGPU::EXEC_LO:
4403 case AMDGPU::EXEC_HI:
4404 case AMDGPU::FLAT_SCR_LO:
4405 case AMDGPU::FLAT_SCR_HI:
4406 if (VT.getSizeInBits() == 32)
4407 return Reg;
4408 break;
4409 case AMDGPU::EXEC:
4410 case AMDGPU::FLAT_SCR:
4411 if (VT.getSizeInBits() == 64)
4412 return Reg;
4413 break;
4414 default:
4415 llvm_unreachable("missing register type checking");
4416 }
4417
4418   report_fatal_error(
4419       Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4420}
4421
4422// If kill is not the last instruction, split the block so kill is always a
4423// proper terminator.
4426 MachineBasicBlock *BB) const {
4427 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4429 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4430 return SplitBB;
4431}
4432
4433// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4434// \p MI will be the only instruction in the loop body block. Otherwise, it will
4435// be the first instruction in the remainder block.
4436//
4437/// \returns { LoopBody, Remainder }
4438static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4442
4443 // To insert the loop we need to split the block. Move everything after this
4444 // point to a new block, and insert a new empty block between the two.
4446 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4448 ++MBBI;
4449
4450 MF->insert(MBBI, LoopBB);
4451 MF->insert(MBBI, RemainderBB);
4452
4453 LoopBB->addSuccessor(LoopBB);
4454 LoopBB->addSuccessor(RemainderBB);
4455
4456 // Move the rest of the block into a new block.
4457 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4458
4459 if (InstInLoop) {
4460 auto Next = std::next(I);
4461
4462 // Move instruction to loop body.
4463 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4464
4465 // Move the rest of the block.
4466 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4467 } else {
4468 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4469 }
4470
4471 MBB.addSuccessor(LoopBB);
4472
4473 return std::pair(LoopBB, RemainderBB);
4474}
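// Editor's note (illustrative): the control flow produced by the split is
//   MBB -> LoopBB,  LoopBB -> LoopBB (self back edge),  LoopBB -> RemainderBB,
// with RemainderBB inheriting MBB's original successors and PHIs.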
4475
4476/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4478 MachineBasicBlock *MBB = MI.getParent();
4480 auto I = MI.getIterator();
4481 auto E = std::next(I);
4482
4483 // clang-format off
4484 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4485 .addImm(0);
4486 // clang-format on
4487
4488 MIBundleBuilder Bundler(*MBB, I, E);
4489 finalizeBundle(*MBB, Bundler.begin());
4490}
4491
4494 MachineBasicBlock *BB) const {
4495 const DebugLoc &DL = MI.getDebugLoc();
4496
4498
4500
4501 // Apparently kill flags are only valid if the def is in the same block?
4502 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4503 Src->setIsKill(false);
4504
4505 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4506
4507 MachineBasicBlock::iterator I = LoopBB->end();
4508
4509 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4510       AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4511 
4512 // Clear TRAP_STS.MEM_VIOL
4513 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4514 .addImm(0)
4515 .addImm(EncodedReg);
4516
4518
4519 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4520
4521 // Load and check TRAP_STS.MEM_VIOL
4522 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4523 .addImm(EncodedReg);
4524
4525 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4526 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4527 .addReg(Reg, RegState::Kill)
4528 .addImm(0);
4529 // clang-format off
4530 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4531 .addMBB(LoopBB);
4532 // clang-format on
4533
4534 return RemainderBB;
4535}
4536
4537// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4538// wavefront. If the value is uniform and just happens to be in a VGPR, this
4539// will only do one iteration. In the worst case, this will loop 64 times.
4540//
4541// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4544 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4545 const DebugLoc &DL, const MachineOperand &Idx,
4546 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4547 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4548 Register &SGPRIdxReg) {
4549
4550 MachineFunction *MF = OrigBB.getParent();
4551 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4552 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4554
4555 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4556 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4557 Register NewExec = MRI.createVirtualRegister(BoolRC);
4558 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4559 Register CondReg = MRI.createVirtualRegister(BoolRC);
4560
4561 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4562 .addReg(InitReg)
4563 .addMBB(&OrigBB)
4564 .addReg(ResultReg)
4565 .addMBB(&LoopBB);
4566
4567 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4568 .addReg(InitSaveExecReg)
4569 .addMBB(&OrigBB)
4570 .addReg(NewExec)
4571 .addMBB(&LoopBB);
4572
4573 // Read the next variant <- also loop target.
4574 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4575 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4576
4577 // Compare the just read M0 value to all possible Idx values.
4578 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4579 .addReg(CurrentIdxReg)
4580 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4581
4582 // Update EXEC, save the original EXEC value to VCC.
4583 BuildMI(LoopBB, I, DL,
4584 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4585 : AMDGPU::S_AND_SAVEEXEC_B64),
4586 NewExec)
4587 .addReg(CondReg, RegState::Kill);
4588
4589 MRI.setSimpleHint(NewExec, CondReg);
4590
4591 if (UseGPRIdxMode) {
4592 if (Offset == 0) {
4593 SGPRIdxReg = CurrentIdxReg;
4594 } else {
4595 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4596 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4597 .addReg(CurrentIdxReg, RegState::Kill)
4598 .addImm(Offset);
4599 }
4600 } else {
4601 // Move index from VCC into M0
4602 if (Offset == 0) {
4603 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4604 .addReg(CurrentIdxReg, RegState::Kill);
4605 } else {
4606 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4607 .addReg(CurrentIdxReg, RegState::Kill)
4608 .addImm(Offset);
4609 }
4610 }
4611
4612 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4613 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4614 MachineInstr *InsertPt =
4615 BuildMI(LoopBB, I, DL,
4616 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4617 : AMDGPU::S_XOR_B64_term),
4618 Exec)
4619 .addReg(Exec)
4620 .addReg(NewExec);
4621
4622 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4623 // s_cbranch_scc0?
4624
4625 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4626 // clang-format off
4627 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4628 .addMBB(&LoopBB);
4629 // clang-format on
4630
4631 return InsertPt->getIterator();
4632}
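// Editor's note (illustrative): the waterfall loop emitted above has roughly
// this shape (pseudo-MIR, wave64 opcodes shown; register names are made up):
//   LoopBB:
//     %cur     = V_READFIRSTLANE_B32 %idx
//     %cond    = V_CMP_EQ_U32_e64 %cur, %idx
//     %newexec = S_AND_SAVEEXEC_B64 %cond
//     M0       = S_MOV_B32 %cur          ; or S_ADD_I32 %cur, offset
//     ...indexed access using M0 (or the SGPR index in GPR-idx mode)...
//     EXEC     = S_XOR_B64_term EXEC, %newexec
//     S_CBRANCH_EXECNZ LoopBB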
4633
4634// This has slightly sub-optimal regalloc when the source vector is killed by
4635// the read. The register allocator does not understand that the kill is
4636 // per-workitem, so the value is kept alive for the whole loop and we end up
4637 // not reusing a subregister from it, using one more VGPR than necessary. This
4638 // VGPR was saved back when this was expanded after register allocation.
4641 unsigned InitResultReg, unsigned PhiReg, int Offset,
4642 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4644 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4645 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4647 const DebugLoc &DL = MI.getDebugLoc();
4649
4650 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4651 Register DstReg = MI.getOperand(0).getReg();
4652 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4653 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4654 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4655 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4656
4657 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4658
4659 // Save the EXEC mask
4660 // clang-format off
4661 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4662 .addReg(Exec);
4663 // clang-format on
4664
4665 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4666
4667 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4668
4669 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4670 InitResultReg, DstReg, PhiReg, TmpExec,
4671 Offset, UseGPRIdxMode, SGPRIdxReg);
4672
4673 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4675 ++MBBI;
4676 MF->insert(MBBI, LandingPad);
4677 LoopBB->removeSuccessor(RemainderBB);
4678 LandingPad->addSuccessor(RemainderBB);
4679 LoopBB->addSuccessor(LandingPad);
4680 MachineBasicBlock::iterator First = LandingPad->begin();
4681 // clang-format off
4682 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4683 .addReg(SaveExec);
4684 // clang-format on
4685
4686 return InsPt;
4687}
4688
4689// Returns subreg index, offset
4690static std::pair<unsigned, int>
4692 const TargetRegisterClass *SuperRC, unsigned VecReg,
4693 int Offset) {
4694 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4695
4696 // Skip out of bounds offsets, or else we would end up using an undefined
4697 // register.
4698 if (Offset >= NumElts || Offset < 0)
4699 return std::pair(AMDGPU::sub0, Offset);
4700
4701 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4702}
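// Editor's note (illustrative): for a 128-bit super-register class
// (NumElts = 4), Offset = 2 yields {sub2, 0} (a plain subregister access),
// while Offset = 5 is out of bounds and yields {sub0, 5}.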
4703
4706 int Offset) {
4707 MachineBasicBlock *MBB = MI.getParent();
4708 const DebugLoc &DL = MI.getDebugLoc();
4710
4711 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4712
4713 assert(Idx->getReg() != AMDGPU::NoRegister);
4714
4715 if (Offset == 0) {
4716 // clang-format off
4717 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4718 .add(*Idx);
4719 // clang-format on
4720 } else {
4721 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4722 .add(*Idx)
4723 .addImm(Offset);
4724 }
4725}
4726
4729 int Offset) {
4730 MachineBasicBlock *MBB = MI.getParent();
4731 const DebugLoc &DL = MI.getDebugLoc();
4733
4734 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4735
4736 if (Offset == 0)
4737 return Idx->getReg();
4738
4739 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4740 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4741 .add(*Idx)
4742 .addImm(Offset);
4743 return Tmp;
4744}
4745
4748 const GCNSubtarget &ST) {
4749 const SIInstrInfo *TII = ST.getInstrInfo();
4750 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4753
4754 Register Dst = MI.getOperand(0).getReg();
4755 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4756 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4757 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4758
4759 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4760 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4761
4762 unsigned SubReg;
4763 std::tie(SubReg, Offset) =
4764 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4765
4766 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4767
4768 // Check for a SGPR index.
4769 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4771 const DebugLoc &DL = MI.getDebugLoc();
4772
4773 if (UseGPRIdxMode) {
4774 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4775 // to avoid interfering with other uses, so probably requires a new
4776 // optimization pass.
4778
4779 const MCInstrDesc &GPRIDXDesc =
4780 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4781 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4782 .addReg(SrcReg)
4783 .addReg(Idx)
4784 .addImm(SubReg);
4785 } else {
4787
4788 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4789 .addReg(SrcReg, 0, SubReg)
4790 .addReg(SrcReg, RegState::Implicit);
4791 }
4792
4793 MI.eraseFromParent();
4794
4795 return &MBB;
4796 }
4797
4798 // Control flow needs to be inserted if indexing with a VGPR.
4799 const DebugLoc &DL = MI.getDebugLoc();
4801
4802 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4803 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4804
4805 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4806
4807 Register SGPRIdxReg;
4808 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4809 UseGPRIdxMode, SGPRIdxReg);
4810
4811 MachineBasicBlock *LoopBB = InsPt->getParent();
4812
4813 if (UseGPRIdxMode) {
4814 const MCInstrDesc &GPRIDXDesc =
4815 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4816
4817 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4818 .addReg(SrcReg)
4819 .addReg(SGPRIdxReg)
4820 .addImm(SubReg);
4821 } else {
4822 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4823 .addReg(SrcReg, 0, SubReg)
4824 .addReg(SrcReg, RegState::Implicit);
4825 }
4826
4827 MI.eraseFromParent();
4828
4829 return LoopBB;
4830}
4831
4834 const GCNSubtarget &ST) {
4835 const SIInstrInfo *TII = ST.getInstrInfo();
4836 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4839
4840 Register Dst = MI.getOperand(0).getReg();
4841 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4842 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4843 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4844 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4845 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4846 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4847
4848 // This can be an immediate, but will be folded later.
4849 assert(Val->getReg());
4850
4851 unsigned SubReg;
4852 std::tie(SubReg, Offset) =
4853 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4854 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4855
4856 if (Idx->getReg() == AMDGPU::NoRegister) {
4858 const DebugLoc &DL = MI.getDebugLoc();
4859
4860 assert(Offset == 0);
4861
4862 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4863 .add(*SrcVec)
4864 .add(*Val)
4865 .addImm(SubReg);
4866
4867 MI.eraseFromParent();
4868 return &MBB;
4869 }
4870
4871 // Check for a SGPR index.
4872 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4874 const DebugLoc &DL = MI.getDebugLoc();
4875
4876 if (UseGPRIdxMode) {
4878
4879 const MCInstrDesc &GPRIDXDesc =
4880 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4881 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4882 .addReg(SrcVec->getReg())
4883 .add(*Val)
4884 .addReg(Idx)
4885 .addImm(SubReg);
4886 } else {
4888
4889 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4890 TRI.getRegSizeInBits(*VecRC), 32, false);
4891 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4892 .addReg(SrcVec->getReg())
4893 .add(*Val)
4894 .addImm(SubReg);
4895 }
4896 MI.eraseFromParent();
4897 return &MBB;
4898 }
4899
4900 // Control flow needs to be inserted if indexing with a VGPR.
4901 if (Val->isReg())
4902 MRI.clearKillFlags(Val->getReg());
4903
4904 const DebugLoc &DL = MI.getDebugLoc();
4905
4906 Register PhiReg = MRI.createVirtualRegister(VecRC);
4907
4908 Register SGPRIdxReg;
4909 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4910 UseGPRIdxMode, SGPRIdxReg);
4911 MachineBasicBlock *LoopBB = InsPt->getParent();
4912
4913 if (UseGPRIdxMode) {
4914 const MCInstrDesc &GPRIDXDesc =
4915 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4916
4917 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4918 .addReg(PhiReg)
4919 .add(*Val)
4920 .addReg(SGPRIdxReg)
4921 .addImm(SubReg);
4922 } else {
4923 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4924 TRI.getRegSizeInBits(*VecRC), 32, false);
4925 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4926 .addReg(PhiReg)
4927 .add(*Val)
4928 .addImm(SubReg);
4929 }
4930
4931 MI.eraseFromParent();
4932 return LoopBB;
4933}
4934
4937 const GCNSubtarget &ST,
4938 unsigned Opc) {
4940 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4941 const DebugLoc &DL = MI.getDebugLoc();
4942 const SIInstrInfo *TII = ST.getInstrInfo();
4943
4944 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4945 Register SrcReg = MI.getOperand(1).getReg();
4946 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4947 Register DstReg = MI.getOperand(0).getReg();
4948 MachineBasicBlock *RetBB = nullptr;
4949 if (isSGPR) {
4950     // These operations are idempotent on a uniform value, i.e. an SGPR;
4951     // the reduced value is the same as the given SGPR.
4952 // clang-format off
4953 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4954 .addReg(SrcReg);
4955 // clang-format on
4956 RetBB = &BB;
4957 } else {
4958     // TODO: Implement DPP Strategy and switch based on immediate strategy
4959     // operand. For now, for all the cases (default, Iterative and DPP), we
4960     // use the iterative approach by default.
4961 
4962     // To reduce the VGPR using the iterative approach, we need to iterate
4963     // over all the active lanes. Lowering consists of ComputeLoop, which
4964     // iterates over only the active lanes. We use a copy of the EXEC register
4965     // as the induction variable, and each active lane clears its bit with
4966     // bitset0 so that the next iteration picks up the next active lane.
4968 Register SrcReg = MI.getOperand(1).getReg();
4969
4970 // Create Control flow for loop
4971 // Split MI's Machine Basic block into For loop
4972 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4973
4974 // Create virtual registers required for lowering.
4975 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4976 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4977 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4978 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4979
4980 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4981 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4982 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4983
4984 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4985 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4986
4987 bool IsWave32 = ST.isWave32();
4988 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4989 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4990
4991     // Create the initial values of the induction variable (from EXEC) and the
4992     // accumulator, and insert a branch to the newly created ComputeLoop block.
4993 uint32_t InitalValue =
4994 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4995 auto TmpSReg =
4996 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4997 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4998 .addImm(InitalValue);
4999 // clang-format off
5000 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5001 .addMBB(ComputeLoop);
5002 // clang-format on
5003
5004 // Start constructing ComputeLoop
5005 I = ComputeLoop->end();
5006 auto Accumulator =
5007 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5008 .addReg(InitalValReg)
5009 .addMBB(&BB);
5010 auto ActiveBits =
5011 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5012 .addReg(TmpSReg->getOperand(0).getReg())
5013 .addMBB(&BB);
5014
5015 // Perform the computations
5016 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5017 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5018 .addReg(ActiveBits->getOperand(0).getReg());
5019 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5020 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5021 .addReg(SrcReg)
5022 .addReg(FF1->getOperand(0).getReg());
5023 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5024 .addReg(Accumulator->getOperand(0).getReg())
5025 .addReg(LaneValue->getOperand(0).getReg());
5026
5027 // Manipulate the iterator to get the next active lane
5028 unsigned BITSETOpc =
5029 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5030 auto NewActiveBits =
5031 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5032 .addReg(FF1->getOperand(0).getReg())
5033 .addReg(ActiveBits->getOperand(0).getReg());
5034
5035 // Add phi nodes
5036 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5037 .addMBB(ComputeLoop);
5038 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5039 .addMBB(ComputeLoop);
5040
5041     // Create the branch back to ComputeLoop while active lanes remain.
5042 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5043 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5044 .addReg(NewActiveBits->getOperand(0).getReg())
5045 .addImm(0);
5046 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5047 .addMBB(ComputeLoop);
5048
5049 RetBB = ComputeEnd;
5050 }
5051 MI.eraseFromParent();
5052 return RetBB;
5053}
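// Editor's note: a standalone scalar model (not part of this file) of the
// iterative reduction emitted above, shown for the umin case with a 64-bit
// EXEC mask; `LaneVals` stands in for the per-lane VGPR values, and
// llvm::countr_zero (llvm/ADT/bit.h), <algorithm> and <limits> are assumed.
static uint32_t modelWaveReduceUMin(uint64_t Exec, const uint32_t *LaneVals) {
  uint32_t Acc = std::numeric_limits<uint32_t>::max(); // identity for umin
  while (Exec != 0) {
    unsigned Lane = llvm::countr_zero(Exec); // s_ff1_i32_b64
    Acc = std::min(Acc, LaneVals[Lane]);     // v_readlane_b32 + s_min_u32
    Exec &= ~(uint64_t(1) << Lane);          // s_bitset0_b64
  }
  return Acc;
}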
5054
5057 MachineBasicBlock *BB) const {
5058
5060 MachineFunction *MF = BB->getParent();
5062
5063 switch (MI.getOpcode()) {
5064 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5065 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5066 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5067 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5068 case AMDGPU::S_UADDO_PSEUDO:
5069 case AMDGPU::S_USUBO_PSEUDO: {
5070 const DebugLoc &DL = MI.getDebugLoc();
5071 MachineOperand &Dest0 = MI.getOperand(0);
5072 MachineOperand &Dest1 = MI.getOperand(1);
5073 MachineOperand &Src0 = MI.getOperand(2);
5074 MachineOperand &Src1 = MI.getOperand(3);
5075
5076 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5077 ? AMDGPU::S_ADD_I32
5078 : AMDGPU::S_SUB_I32;
5079 // clang-format off
5080 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5081 .add(Src0)
5082 .add(Src1);
5083 // clang-format on
5084
5085 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5086 .addImm(1)
5087 .addImm(0);
5088
5089 MI.eraseFromParent();
5090 return BB;
5091 }
5092 case AMDGPU::S_ADD_U64_PSEUDO:
5093 case AMDGPU::S_SUB_U64_PSEUDO: {
5094 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5095 // For GFX12, we emit s_add_u64 and s_sub_u64.
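    // Editor's note (illustrative): the pre-GFX12 expansion below has the
    // shape
    //   lo = S_ADD_U32  a.lo, b.lo   ; sets SCC to the carry out
    //   hi = S_ADDC_U32 a.hi, b.hi   ; consumes SCC
    // (S_SUB_U32 / S_SUBB_U32 for the subtraction pseudo).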
5096 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5098 const DebugLoc &DL = MI.getDebugLoc();
5099 MachineOperand &Dest = MI.getOperand(0);
5100 MachineOperand &Src0 = MI.getOperand(1);
5101 MachineOperand &Src1 = MI.getOperand(2);
5102 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5103 if (Subtarget->hasScalarAddSub64()) {
5104 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5105 // clang-format off
5106 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5107 .add(Src0)
5108 .add(Src1);
5109 // clang-format on
5110 } else {
5111 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5112 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5113
5114 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5115 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5116
5117 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5118 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5119 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5120 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5121
5122 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5123 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5124 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5125 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5126
5127 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5128 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5129 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5130 .add(Src0Sub0)
5131 .add(Src1Sub0);
5132 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5133 .add(Src0Sub1)
5134 .add(Src1Sub1);
5135 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5136 .addReg(DestSub0)
5137 .addImm(AMDGPU::sub0)
5138 .addReg(DestSub1)
5139 .addImm(AMDGPU::sub1);
5140 }
5141 MI.eraseFromParent();
5142 return BB;
5143 }
5144 case AMDGPU::V_ADD_U64_PSEUDO:
5145 case AMDGPU::V_SUB_U64_PSEUDO: {
5147 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5148 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5149 const DebugLoc &DL = MI.getDebugLoc();
5150
5151 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5152
5153 MachineOperand &Dest = MI.getOperand(0);
5154 MachineOperand &Src0 = MI.getOperand(1);
5155 MachineOperand &Src1 = MI.getOperand(2);
5156
5157 if (IsAdd && ST.hasLshlAddB64()) {
5158 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5159 Dest.getReg())
5160 .add(Src0)
5161 .addImm(0)
5162 .add(Src1);
5163 TII->legalizeOperands(*Add);
5164 MI.eraseFromParent();
5165 return BB;
5166 }
5167
5168 const auto *CarryRC = TRI->getWaveMaskRegClass();
5169
5170 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5171 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5172
5173 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5174 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5175
5176 const TargetRegisterClass *Src0RC = Src0.isReg()
5177 ? MRI.getRegClass(Src0.getReg())
5178 : &AMDGPU::VReg_64RegClass;
5179 const TargetRegisterClass *Src1RC = Src1.isReg()
5180 ? MRI.getRegClass(Src1.getReg())
5181 : &AMDGPU::VReg_64RegClass;
5182
5183 const TargetRegisterClass *Src0SubRC =
5184 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5185 const TargetRegisterClass *Src1SubRC =
5186 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5187
5188 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5189 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5190 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5191 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5192
5193 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5194 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5195 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5196 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5197
5198 unsigned LoOpc =
5199 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5200 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5201 .addReg(CarryReg, RegState::Define)
5202 .add(SrcReg0Sub0)
5203 .add(SrcReg1Sub0)
5204 .addImm(0); // clamp bit
5205
5206 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5207 MachineInstr *HiHalf =
5208 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5209 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5210 .add(SrcReg0Sub1)
5211 .add(SrcReg1Sub1)
5212 .addReg(CarryReg, RegState::Kill)
5213 .addImm(0); // clamp bit
5214
5215 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5216 .addReg(DestSub0)
5217 .addImm(AMDGPU::sub0)
5218 .addReg(DestSub1)
5219 .addImm(AMDGPU::sub1);
5220 TII->legalizeOperands(*LoHalf);
5221 TII->legalizeOperands(*HiHalf);
5222 MI.eraseFromParent();
5223 return BB;
5224 }
5225 case AMDGPU::S_ADD_CO_PSEUDO:
5226 case AMDGPU::S_SUB_CO_PSEUDO: {
5227     // This pseudo can only be selected from a uniform add/subcarry node.
5228     // All the VGPR operands are therefore assumed to be splat vectors.
5231 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5232 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5234 const DebugLoc &DL = MI.getDebugLoc();
5235 MachineOperand &Dest = MI.getOperand(0);
5236 MachineOperand &CarryDest = MI.getOperand(1);
5237 MachineOperand &Src0 = MI.getOperand(2);
5238 MachineOperand &Src1 = MI.getOperand(3);
5239 MachineOperand &Src2 = MI.getOperand(4);
5240 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5241 ? AMDGPU::S_ADDC_U32
5242 : AMDGPU::S_SUBB_U32;
5243 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5244 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5245 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5246 .addReg(Src0.getReg());
5247 Src0.setReg(RegOp0);
5248 }
5249 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5250 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5251 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5252 .addReg(Src1.getReg());
5253 Src1.setReg(RegOp1);
5254 }
5255 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5256 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5257 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5258 .addReg(Src2.getReg());
5259 Src2.setReg(RegOp2);
5260 }
5261
5262 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5263 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5264 assert(WaveSize == 64 || WaveSize == 32);
5265
5266 if (WaveSize == 64) {
5267 if (ST.hasScalarCompareEq64()) {
5268 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5269 .addReg(Src2.getReg())
5270 .addImm(0);
5271 } else {
5272 const TargetRegisterClass *SubRC =
5273 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5274 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5275 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5276 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5277 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5278 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5279
5280 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5281 .add(Src2Sub0)
5282 .add(Src2Sub1);
5283
5284 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5285 .addReg(Src2_32, RegState::Kill)
5286 .addImm(0);
5287 }
5288 } else {
5289 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5290 .addReg(Src2.getReg())
5291 .addImm(0);
5292 }
5293
5294 // clang-format off
5295 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5296 .add(Src0)
5297 .add(Src1);
5298 // clang-format on
5299
5300 unsigned SelOpc =
5301 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5302
5303 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5304 .addImm(-1)
5305 .addImm(0);
5306
5307 MI.eraseFromParent();
5308 return BB;
5309 }
5310 case AMDGPU::SI_INIT_M0: {
5311 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5312 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5313 .add(MI.getOperand(0));
5314 MI.eraseFromParent();
5315 return BB;
5316 }
5317 case AMDGPU::GET_GROUPSTATICSIZE: {
5318 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5319 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5320 DebugLoc DL = MI.getDebugLoc();
5321 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5322 .add(MI.getOperand(0))
5323 .addImm(MFI->getLDSSize());
5324 MI.eraseFromParent();
5325 return BB;
5326 }
5327 case AMDGPU::GET_SHADERCYCLESHILO: {
5330 const DebugLoc &DL = MI.getDebugLoc();
5331 // The algorithm is:
5332 //
5333 // hi1 = getreg(SHADER_CYCLES_HI)
5334 // lo1 = getreg(SHADER_CYCLES_LO)
5335 // hi2 = getreg(SHADER_CYCLES_HI)
5336 //
5337 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5338 // Otherwise there was overflow and the result is hi2:0. In both cases the
5339 // result should represent the actual time at some point during the sequence
5340 // of three getregs.
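// For example, if the counter rolls over between the reads (hi1 = 5,
// lo1 = 0xffffffff, hi2 = 6), the returned value 6:0 is exactly the value
// the counter held at the moment of the rollover, which happened somewhere
// between the first and the last getreg.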
5341 using namespace AMDGPU::Hwreg;
5342 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5343 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5344 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5345 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5346 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5347 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5348 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5349 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5350 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5351 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5352 .addReg(RegHi1)
5353 .addReg(RegHi2);
5354 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5355 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5356 .addReg(RegLo1)
5357 .addImm(0);
5358 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5359 .add(MI.getOperand(0))
5360 .addReg(RegLo)
5361 .addImm(AMDGPU::sub0)
5362 .addReg(RegHi2)
5363 .addImm(AMDGPU::sub1);
5364 MI.eraseFromParent();
5365 return BB;
5366 }
5367 case AMDGPU::SI_INDIRECT_SRC_V1:
5368 case AMDGPU::SI_INDIRECT_SRC_V2:
5369 case AMDGPU::SI_INDIRECT_SRC_V4:
5370 case AMDGPU::SI_INDIRECT_SRC_V8:
5371 case AMDGPU::SI_INDIRECT_SRC_V9:
5372 case AMDGPU::SI_INDIRECT_SRC_V10:
5373 case AMDGPU::SI_INDIRECT_SRC_V11:
5374 case AMDGPU::SI_INDIRECT_SRC_V12:
5375 case AMDGPU::SI_INDIRECT_SRC_V16:
5376 case AMDGPU::SI_INDIRECT_SRC_V32:
5377 return emitIndirectSrc(MI, *BB, *getSubtarget());
5378 case AMDGPU::SI_INDIRECT_DST_V1:
5379 case AMDGPU::SI_INDIRECT_DST_V2:
5380 case AMDGPU::SI_INDIRECT_DST_V4:
5381 case AMDGPU::SI_INDIRECT_DST_V8:
5382 case AMDGPU::SI_INDIRECT_DST_V9:
5383 case AMDGPU::SI_INDIRECT_DST_V10:
5384 case AMDGPU::SI_INDIRECT_DST_V11:
5385 case AMDGPU::SI_INDIRECT_DST_V12:
5386 case AMDGPU::SI_INDIRECT_DST_V16:
5387 case AMDGPU::SI_INDIRECT_DST_V32:
5388 return emitIndirectDst(MI, *BB, *getSubtarget());
5389 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5390 case AMDGPU::SI_KILL_I1_PSEUDO:
5391 return splitKillBlock(MI, BB);
5392 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
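// Expand the 64-bit select into two V_CNDMASK_B32_e64 on the sub0/sub1
// halves, sharing a single copy of the condition mask, and recombine the
// two 32-bit results with a REG_SEQUENCE.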
5394 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5395 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5396
5397 Register Dst = MI.getOperand(0).getReg();
5398 const MachineOperand &Src0 = MI.getOperand(1);
5399 const MachineOperand &Src1 = MI.getOperand(2);
5400 const DebugLoc &DL = MI.getDebugLoc();
5401 Register SrcCond = MI.getOperand(3).getReg();
5402
5403 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5404 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5405 const auto *CondRC = TRI->getWaveMaskRegClass();
5406 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5407
5408 const TargetRegisterClass *Src0RC = Src0.isReg()
5409 ? MRI.getRegClass(Src0.getReg())
5410 : &AMDGPU::VReg_64RegClass;
5411 const TargetRegisterClass *Src1RC = Src1.isReg()
5412 ? MRI.getRegClass(Src1.getReg())
5413 : &AMDGPU::VReg_64RegClass;
5414
5415 const TargetRegisterClass *Src0SubRC =
5416 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5417 const TargetRegisterClass *Src1SubRC =
5418 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5419
5420 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5421 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5422 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5423 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5424
5425 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5426 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5427 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5428 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5429
5430 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5431 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5432 .addImm(0)
5433 .add(Src0Sub0)
5434 .addImm(0)
5435 .add(Src1Sub0)
5436 .addReg(SrcCondCopy);
5437 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5438 .addImm(0)
5439 .add(Src0Sub1)
5440 .addImm(0)
5441 .add(Src1Sub1)
5442 .addReg(SrcCondCopy);
5443
5444 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5445 .addReg(DstLo)
5446 .addImm(AMDGPU::sub0)
5447 .addReg(DstHi)
5448 .addImm(AMDGPU::sub1);
5449 MI.eraseFromParent();
5450 return BB;
5451 }
5452 case AMDGPU::SI_BR_UNDEF: {
5454 const DebugLoc &DL = MI.getDebugLoc();
5455 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5456 .add(MI.getOperand(0));
5457 Br->getOperand(1).setIsUndef(); // read undef SCC
5458 MI.eraseFromParent();
5459 return BB;
5460 }
5461 case AMDGPU::ADJCALLSTACKUP:
5462 case AMDGPU::ADJCALLSTACKDOWN: {
5464 MachineInstrBuilder MIB(*MF, &MI);
5465 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5466 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5467 return BB;
5468 }
5469 case AMDGPU::SI_CALL_ISEL: {
5471 const DebugLoc &DL = MI.getDebugLoc();
5472
5473 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5474
5476 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5477
5478 for (const MachineOperand &MO : MI.operands())
5479 MIB.add(MO);
5480
5481 MIB.cloneMemRefs(MI);
5482 MI.eraseFromParent();
5483 return BB;
5484 }
5485 case AMDGPU::V_ADD_CO_U32_e32:
5486 case AMDGPU::V_SUB_CO_U32_e32:
5487 case AMDGPU::V_SUBREV_CO_U32_e32: {
5488 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5489 const DebugLoc &DL = MI.getDebugLoc();
5490 unsigned Opc = MI.getOpcode();
5491
5492 bool NeedClampOperand = false;
5493 if (TII->pseudoToMCOpcode(Opc) == -1) {
5494 Opc = AMDGPU::getVOPe64(Opc);
5495 NeedClampOperand = true;
5496 }
5497
5498 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5499 if (TII->isVOP3(*I)) {
5500 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5501 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5502 I.addReg(TRI->getVCC(), RegState::Define);
5503 }
5504 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5505 if (NeedClampOperand)
5506 I.addImm(0); // clamp bit for e64 encoding
5507
5508 TII->legalizeOperands(*I);
5509
5510 MI.eraseFromParent();
5511 return BB;
5512 }
5513 case AMDGPU::V_ADDC_U32_e32:
5514 case AMDGPU::V_SUBB_U32_e32:
5515 case AMDGPU::V_SUBBREV_U32_e32:
5516 // These instructions have an implicit use of vcc which counts towards the
5517 // constant bus limit.
5518 TII->legalizeOperands(MI);
5519 return BB;
5520 case AMDGPU::DS_GWS_INIT:
5521 case AMDGPU::DS_GWS_SEMA_BR:
5522 case AMDGPU::DS_GWS_BARRIER:
5523 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5524 [[fallthrough]];
5525 case AMDGPU::DS_GWS_SEMA_V:
5526 case AMDGPU::DS_GWS_SEMA_P:
5527 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5528 // An s_waitcnt 0 is required to be the instruction immediately following.
5529 if (getSubtarget()->hasGWSAutoReplay()) {
5530 bundleInstWithWaitcnt(MI);
5531 return BB;
5532 }
5533
5534 return emitGWSMemViolTestLoop(MI, BB);
5535 case AMDGPU::S_SETREG_B32: {
5536 // Try to optimize cases that only set the denormal mode or rounding mode.
5537 //
5538 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5539 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5540 // instead.
5541 //
5542 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5543 // allow you to have a no-side-effect instruction in the output of a
5544 // side-effecting pattern.
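// For example (assuming the usual MODE layout with the round mode in bits
// 3:0 and the denorm mode in bits 7:4), an s_setreg of a constant to
// hwreg(HW_REG_MODE, 0, 4) becomes s_round_mode, to hwreg(HW_REG_MODE, 4, 4)
// becomes s_denorm_mode, and to hwreg(HW_REG_MODE, 0, 8) becomes both.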
5545 auto [ID, Offset, Width] =
5546 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5547 if (ID != AMDGPU::Hwreg::ID_MODE)
5548 return BB;
5549
5550 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5551 const unsigned SetMask = WidthMask << Offset;
5552
5553 if (getSubtarget()->hasDenormModeInst()) {
5554 unsigned SetDenormOp = 0;
5555 unsigned SetRoundOp = 0;
5556
5557 // The dedicated instructions can only set the whole denorm or round mode
5558 // at once, not a subset of bits in either.
5559 if (SetMask ==
5560 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5561 // If this fully sets both the round and denorm mode, emit the two
5562 // dedicated instructions for these.
5563 SetRoundOp = AMDGPU::S_ROUND_MODE;
5564 SetDenormOp = AMDGPU::S_DENORM_MODE;
5565 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5566 SetRoundOp = AMDGPU::S_ROUND_MODE;
5567 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5568 SetDenormOp = AMDGPU::S_DENORM_MODE;
5569 }
5570
5571 if (SetRoundOp || SetDenormOp) {
5573 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5574 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5575 unsigned ImmVal = Def->getOperand(1).getImm();
5576 if (SetRoundOp) {
5577 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5578 .addImm(ImmVal & 0xf);
5579
5580 // If we also have the denorm mode, get just the denorm mode bits.
5581 ImmVal >>= 4;
5582 }
5583
5584 if (SetDenormOp) {
5585 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5586 .addImm(ImmVal & 0xf);
5587 }
5588
5589 MI.eraseFromParent();
5590 return BB;
5591 }
5592 }
5593 }
5594
5595 // If only FP bits are touched, use the no-side-effects pseudo.
5596 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5597 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5598 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5599
5600 return BB;
5601 }
5602 case AMDGPU::S_INVERSE_BALLOT_U32:
5603 case AMDGPU::S_INVERSE_BALLOT_U64:
5604 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5605 // necessary. After that they are equivalent to a COPY.
5606 MI.setDesc(TII->get(AMDGPU::COPY));
5607 return BB;
5608 case AMDGPU::ENDPGM_TRAP: {
5609 const DebugLoc &DL = MI.getDebugLoc();
5610 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5611 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5612 MI.addOperand(MachineOperand::CreateImm(0));
5613 return BB;
5614 }
5615
5616 // We need a block split to make the real endpgm a terminator. We also don't
5617 // want to break phis in successor blocks, so we can't just delete to the
5618 // end of the block.
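// After the split, BB ends with an S_CBRANCH_EXECNZ to TrapBB (which holds
// only the s_endpgm): if any lane is still active the wave ends the program
// there, otherwise execution falls through to SplitBB.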
5619
5620 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5621 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5622 MF->push_back(TrapBB);
5623 // clang-format off
5624 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5625 .addImm(0);
5626 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5627 .addMBB(TrapBB);
5628 // clang-format on
5629
5630 BB->addSuccessor(TrapBB);
5631 MI.eraseFromParent();
5632 return SplitBB;
5633 }
5634 case AMDGPU::SIMULATED_TRAP: {
5635 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5637 MachineBasicBlock *SplitBB =
5638 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5639 MI.eraseFromParent();
5640 return SplitBB;
5641 }
5642 default:
5643 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5644 if (!MI.mayStore())
5646 return BB;
5647 }
5649 }
5650}
5651
5653 // This currently forces unfolding various combinations of fsub into fma with
5654 // free fneg'd operands. As long as we have fast FMA (controlled by
5655 // isFMAFasterThanFMulAndFAdd), we should perform these.
5656
5657 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5658 // most of these combines appear to be cycle neutral but save on instruction
5659 // count / code size.
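// e.g. (fsub (fmul a, b), c) can be unfolded into (fma a, b, (fneg c)),
// since the fneg is free as an fma source modifier.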
5660 return true;
5661}
5662
5664
5666 EVT VT) const {
5667 if (!VT.isVector()) {
5668 return MVT::i1;
5669 }
5670 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5671}
5672
5674 // TODO: Should i16 always be used if legal? For now it would force VALU
5675 // shifts.
5676 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5677}
5678
5680 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5681 ? Ty.changeElementSize(16)
5682 : Ty.changeElementSize(32);
5683}
5684
5685// Answering this is somewhat tricky and depends on the specific device, since
5686// different devices have different rates for fma and for f64 operations in general.
5687//
5688// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5689// regardless of which device (although the number of cycles differs between
5690// devices), so it is always profitable for f64.
5691//
5692// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5693// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5694// which we can always do even without fused FP ops since it returns the same
5695// result as the separate operations and since it is always full
5696// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5697// however does not support denormals, so we do report fma as faster if we have
5698// a fast fma device and require denormals.
5699//
5701 EVT VT) const {
5702 VT = VT.getScalarType();
5703
5704 switch (VT.getSimpleVT().SimpleTy) {
5705 case MVT::f32: {
5706 // If mad is not available this depends only on if f32 fma is full rate.
5707 if (!Subtarget->hasMadMacF32Insts())
5708 return Subtarget->hasFastFMAF32();
5709
5710 // Otherwise f32 mad is always full rate and returns the same result as
5711 // the separate operations, so it should be preferred over fma.
5712 // However, it does not support denormals.
5713 if (!denormalModeIsFlushAllF32(MF))
5714 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5715
5716 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5717 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5718 }
5719 case MVT::f64:
5720 return true;
5721 case MVT::f16:
5722 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5723 default:
5724 break;
5725 }
5726
5727 return false;
5728}
5729
5731 LLT Ty) const {
5732 switch (Ty.getScalarSizeInBits()) {
5733 case 16:
5734 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5735 case 32:
5736 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5737 case 64:
5738 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5739 default:
5740 break;
5741 }
5742
5743 return false;
5744}
5745
5746// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5747// specific details.
5749 Type *Ty) const {
5750 switch (Ty->getScalarSizeInBits()) {
5751 case 16: {
5753 return Subtarget->has16BitInsts() &&
5754 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5755 }
5756 case 32: {
5757 if (!Subtarget->hasMadMacF32Insts())
5758 return Subtarget->hasFastFMAF32();
5759
5761 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5762 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5763
5764 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5765 }
5766 case 64:
5767 return true;
5768 default:
5769 break;
5770 }
5771
5772 return false;
5773}
5774
5776 if (!Ty.isScalar())
5777 return false;
5778
5779 if (Ty.getScalarSizeInBits() == 16)
5780 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5781 if (Ty.getScalarSizeInBits() == 32)
5782 return Subtarget->hasMadMacF32Insts() &&
5783 denormalModeIsFlushAllF32(*MI.getMF());
5784
5785 return false;
5786}
5787
5789 const SDNode *N) const {
5790 // TODO: Check future ftz flag
5791 // v_mad_f32/v_mac_f32 do not support denormals.
5792 EVT VT = N->getValueType(0);
5793 if (VT == MVT::f32)
5794 return Subtarget->hasMadMacF32Insts() &&
5795 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5796 if (VT == MVT::f16) {
5797 return Subtarget->hasMadF16() &&
5798 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5799 }
5800
5801 return false;
5802}
5803
5804//===----------------------------------------------------------------------===//
5805// Custom DAG Lowering Operations
5806//===----------------------------------------------------------------------===//
5807
5808// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5809// wider vector type is legal.
5811 SelectionDAG &DAG) const {
5812 unsigned Opc = Op.getOpcode();
5813 EVT VT = Op.getValueType();
5814 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5815 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5816 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5817 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5818
5819 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5820
5821 SDLoc SL(Op);
5822 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5823 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5824
5825 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5826}
5827
5828// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5829// wider vector type is legal.
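// e.g. a v4f16 fadd becomes two v2f16 fadds on the low and high halves that
// are recombined with CONCAT_VECTORS, instead of being scalarized into four
// f16 operations.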
5831 SelectionDAG &DAG) const {
5832 unsigned Opc = Op.getOpcode();
5833 EVT VT = Op.getValueType();
5834 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5835 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5836 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5837 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5838
5839 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5840 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5841
5842 SDLoc SL(Op);
5843
5844 SDValue OpLo =
5845 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5846 SDValue OpHi =
5847 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5848
5849 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5850}
5851
5853 SelectionDAG &DAG) const {
5854 unsigned Opc = Op.getOpcode();
5855 EVT VT = Op.getValueType();
5856 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5857 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5858 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5859 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5860 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5861 VT == MVT::v32bf16);
5862
5863 SDValue Op0 = Op.getOperand(0);
5864 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5865 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5866 : std::pair(Op0, Op0);
5867
5868 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5869 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5870
5871 SDLoc SL(Op);
5872 auto ResVT = DAG.GetSplitDestVTs(VT);
5873
5874 SDValue OpLo =
5875 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5876 SDValue OpHi =
5877 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5878
5879 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5880}
5881
5883 switch (Op.getOpcode()) {
5884 default:
5886 case ISD::BRCOND:
5887 return LowerBRCOND(Op, DAG);
5888 case ISD::RETURNADDR:
5889 return LowerRETURNADDR(Op, DAG);
5890 case ISD::LOAD: {
5891 SDValue Result = LowerLOAD(Op, DAG);
5892 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5893 "Load should return a value and a chain");
5894 return Result;
5895 }
5896 case ISD::FSQRT: {
5897 EVT VT = Op.getValueType();
5898 if (VT == MVT::f32)
5899 return lowerFSQRTF32(Op, DAG);
5900 if (VT == MVT::f64)
5901 return lowerFSQRTF64(Op, DAG);
5902 return SDValue();
5903 }
5904 case ISD::FSIN:
5905 case ISD::FCOS:
5906 return LowerTrig(Op, DAG);
5907 case ISD::SELECT:
5908 return LowerSELECT(Op, DAG);
5909 case ISD::FDIV:
5910 return LowerFDIV(Op, DAG);
5911 case ISD::FFREXP:
5912 return LowerFFREXP(Op, DAG);
5914 return LowerATOMIC_CMP_SWAP(Op, DAG);
5915 case ISD::STORE:
5916 return LowerSTORE(Op, DAG);
5917 case ISD::GlobalAddress: {
5920 return LowerGlobalAddress(MFI, Op, DAG);
5921 }
5923 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5925 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5927 return LowerINTRINSIC_VOID(Op, DAG);
5928 case ISD::ADDRSPACECAST:
5929 return lowerADDRSPACECAST(Op, DAG);
5931 return lowerINSERT_SUBVECTOR(Op, DAG);
5933 return lowerINSERT_VECTOR_ELT(Op, DAG);
5935 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5937 return lowerVECTOR_SHUFFLE(Op, DAG);
5939 return lowerSCALAR_TO_VECTOR(Op, DAG);
5940 case ISD::BUILD_VECTOR:
5941 return lowerBUILD_VECTOR(Op, DAG);
5942 case ISD::FP_ROUND:
5944 return lowerFP_ROUND(Op, DAG);
5945 case ISD::TRAP:
5946 return lowerTRAP(Op, DAG);
5947 case ISD::DEBUGTRAP:
5948 return lowerDEBUGTRAP(Op, DAG);
5949 case ISD::ABS:
5950 case ISD::FABS:
5951 case ISD::FNEG:
5952 case ISD::FCANONICALIZE:
5953 case ISD::BSWAP:
5954 return splitUnaryVectorOp(Op, DAG);
5955 case ISD::FMINNUM:
5956 case ISD::FMAXNUM:
5957 return lowerFMINNUM_FMAXNUM(Op, DAG);
5958 case ISD::FLDEXP:
5959 case ISD::STRICT_FLDEXP:
5960 return lowerFLDEXP(Op, DAG);
5961 case ISD::FMA:
5962 return splitTernaryVectorOp(Op, DAG);
5963 case ISD::FP_TO_SINT:
5964 case ISD::FP_TO_UINT:
5965 return LowerFP_TO_INT(Op, DAG);
5966 case ISD::SHL:
5967 case ISD::SRA:
5968 case ISD::SRL:
5969 case ISD::ADD:
5970 case ISD::SUB:
5971 case ISD::SMIN:
5972 case ISD::SMAX:
5973 case ISD::UMIN:
5974 case ISD::UMAX:
5975 case ISD::FADD:
5976 case ISD::FMUL:
5977 case ISD::FMINNUM_IEEE:
5978 case ISD::FMAXNUM_IEEE:
5979 case ISD::FMINIMUM:
5980 case ISD::FMAXIMUM:
5981 case ISD::FMINIMUMNUM:
5982 case ISD::FMAXIMUMNUM:
5983 case ISD::UADDSAT:
5984 case ISD::USUBSAT:
5985 case ISD::SADDSAT:
5986 case ISD::SSUBSAT:
5987 return splitBinaryVectorOp(Op, DAG);
5988 case ISD::MUL:
5989 return lowerMUL(Op, DAG);
5990 case ISD::SMULO:
5991 case ISD::UMULO:
5992 return lowerXMULO(Op, DAG);
5993 case ISD::SMUL_LOHI:
5994 case ISD::UMUL_LOHI:
5995 return lowerXMUL_LOHI(Op, DAG);
5997 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5998 case ISD::STACKSAVE:
5999 return LowerSTACKSAVE(Op, DAG);
6000 case ISD::GET_ROUNDING:
6001 return lowerGET_ROUNDING(Op, DAG);
6002 case ISD::SET_ROUNDING:
6003 return lowerSET_ROUNDING(Op, DAG);
6004 case ISD::PREFETCH:
6005 return lowerPREFETCH(Op, DAG);
6006 case ISD::FP_EXTEND:
6008 return lowerFP_EXTEND(Op, DAG);
6009 case ISD::GET_FPENV:
6010 return lowerGET_FPENV(Op, DAG);
6011 case ISD::SET_FPENV:
6012 return lowerSET_FPENV(Op, DAG);
6013 }
6014 return SDValue();
6015}
6016
6017// Used for D16: Casts the result of an instruction into the right vector,
6018// packs values if loads return unpacked values.
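// e.g. on a subtarget with unpacked D16, a v4f16 load is returned as v4i32
// with the data in the low 16 bits of each element; the elements are
// truncated to i16, rebuilt as a v4i16 build_vector and bitcast to v4f16.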
6020 const SDLoc &DL, SelectionDAG &DAG,
6021 bool Unpacked) {
6022 if (!LoadVT.isVector())
6023 return Result;
6024
6025 // Cast back to the original packed type or to a larger type that is a
6026 // multiple of 32 bits for D16. Widening the return type is required for
6027 // legalization.
6028 EVT FittingLoadVT = LoadVT;
6029 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6030 FittingLoadVT =
6032 LoadVT.getVectorNumElements() + 1);
6033 }
6034
6035 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6036 // Truncate to v2i16/v4i16.
6037 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6038
6039 // Work around the legalizer neither scalarizing the truncate after vector op
6040 // legalization nor creating an intermediate vector trunc.
6042 DAG.ExtractVectorElements(Result, Elts);
6043 for (SDValue &Elt : Elts)
6044 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6045
6046 // Pad illegal v1i16/v3f16 to v4i16
6047 if ((LoadVT.getVectorNumElements() % 2) == 1)
6048 Elts.push_back(DAG.getUNDEF(MVT::i16));
6049
6050 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6051
6052 // Bitcast to original type (v2f16/v4f16).
6053 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6054 }
6055
6056 // Cast back to the original packed type.
6057 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6058}
6059
6060SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6061 SelectionDAG &DAG,
6063 bool IsIntrinsic) const {
6064 SDLoc DL(M);
6065
6066 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6067 EVT LoadVT = M->getValueType(0);
6068
6069 EVT EquivLoadVT = LoadVT;
6070 if (LoadVT.isVector()) {
6071 if (Unpacked) {
6072 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6073 LoadVT.getVectorNumElements());
6074 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6075 // Widen v3f16 to legal type
6076 EquivLoadVT =
6078 LoadVT.getVectorNumElements() + 1);
6079 }
6080 }
6081
6082 // Change from v4f16/v2f16 to EquivLoadVT.
6083 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6084
6086 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6087 M->getMemoryVT(), M->getMemOperand());
6088
6089 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6090
6091 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6092}
6093
6094SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6095 SelectionDAG &DAG,
6096 ArrayRef<SDValue> Ops) const {
6097 SDLoc DL(M);
6098 EVT LoadVT = M->getValueType(0);
6099 EVT EltType = LoadVT.getScalarType();
6100 EVT IntVT = LoadVT.changeTypeToInteger();
6101
6102 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6103
6104 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6105 bool IsTFE = M->getNumValues() == 3;
6106
6107 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6108 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6109 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6110 : AMDGPUISD::BUFFER_LOAD;
6111
6112 if (IsD16) {
6113 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6114 }
6115
6116 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6117 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6118 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6119 IsTFE);
6120
6121 if (isTypeLegal(LoadVT)) {
6122 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6123 M->getMemOperand(), DAG);
6124 }
6125
6126 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6127 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6128 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6129 M->getMemOperand(), DAG);
6130 return DAG.getMergeValues(
6131 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6132 DL);
6133}
6134
6136 SelectionDAG &DAG) {
6137 EVT VT = N->getValueType(0);
6138 unsigned CondCode = N->getConstantOperandVal(3);
6139 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6140 return DAG.getUNDEF(VT);
6141
6142 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6143
6144 SDValue LHS = N->getOperand(1);
6145 SDValue RHS = N->getOperand(2);
6146
6147 SDLoc DL(N);
6148
6149 EVT CmpVT = LHS.getValueType();
6150 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6151 unsigned PromoteOp =
6152 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6153 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6154 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6155 }
6156
6157 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6158
6159 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6160 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6161
6162 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6163 DAG.getCondCode(CCOpcode));
6164 if (VT.bitsEq(CCVT))
6165 return SetCC;
6166 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6167}
6168
6170 SelectionDAG &DAG) {
6171 EVT VT = N->getValueType(0);
6172
6173 unsigned CondCode = N->getConstantOperandVal(3);
6174 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6175 return DAG.getUNDEF(VT);
6176
6177 SDValue Src0 = N->getOperand(1);
6178 SDValue Src1 = N->getOperand(2);
6179 EVT CmpVT = Src0.getValueType();
6180 SDLoc SL(N);
6181
6182 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6183 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6184 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6185 }
6186
6187 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6188 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6189 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6190 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6191 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6192 DAG.getCondCode(CCOpcode));
6193 if (VT.bitsEq(CCVT))
6194 return SetCC;
6195 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6196}
6197
6199 SelectionDAG &DAG) {
6200 EVT VT = N->getValueType(0);
6201 SDValue Src = N->getOperand(1);
6202 SDLoc SL(N);
6203
6204 if (Src.getOpcode() == ISD::SETCC) {
6205 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6206 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6207 Src.getOperand(1), Src.getOperand(2));
6208 }
6209 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6210 // (ballot 0) -> 0
6211 if (Arg->isZero())
6212 return DAG.getConstant(0, SL, VT);
6213
6214 // (ballot 1) -> EXEC/EXEC_LO
6215 if (Arg->isOne()) {
6216 Register Exec;
6217 if (VT.getScalarSizeInBits() == 32)
6218 Exec = AMDGPU::EXEC_LO;
6219 else if (VT.getScalarSizeInBits() == 64)
6220 Exec = AMDGPU::EXEC;
6221 else
6222 return SDValue();
6223
6224 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6225 }
6226 }
6227
6228 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6229 // ISD::SETNE)
6230 return DAG.getNode(
6231 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6232 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6233}
6234
6236 SelectionDAG &DAG) {
6237 EVT VT = N->getValueType(0);
6238 unsigned ValSize = VT.getSizeInBits();
6239 unsigned IID = N->getConstantOperandVal(0);
6240 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6241 IID == Intrinsic::amdgcn_permlanex16;
6242 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6243 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6244 SDLoc SL(N);
6245 MVT IntVT = MVT::getIntegerVT(ValSize);
6246 const GCNSubtarget *ST = TLI.getSubtarget();
6247 unsigned SplitSize = 32;
6248 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6249 ST->hasDPALU_DPP() &&
6250 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6251 SplitSize = 64;
6252
6253 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6254 SDValue Src2, MVT ValT) -> SDValue {
6256 switch (IID) {
6257 case Intrinsic::amdgcn_permlane16:
6258 case Intrinsic::amdgcn_permlanex16:
6259 case Intrinsic::amdgcn_update_dpp:
6260 Operands.push_back(N->getOperand(6));
6261 Operands.push_back(N->getOperand(5));
6262 Operands.push_back(N->getOperand(4));
6263 [[fallthrough]];
6264 case Intrinsic::amdgcn_writelane:
6265 Operands.push_back(Src2);
6266 [[fallthrough]];
6267 case Intrinsic::amdgcn_readlane:
6268 case Intrinsic::amdgcn_set_inactive:
6269 case Intrinsic::amdgcn_set_inactive_chain_arg:
6270 case Intrinsic::amdgcn_mov_dpp8:
6271 Operands.push_back(Src1);
6272 [[fallthrough]];
6273 case Intrinsic::amdgcn_readfirstlane:
6274 case Intrinsic::amdgcn_permlane64:
6275 Operands.push_back(Src0);
6276 break;
6277 default:
6278 llvm_unreachable("unhandled lane op");
6279 }
6280
6281 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6282 std::reverse(Operands.begin(), Operands.end());
6283
6284 if (SDNode *GL = N->getGluedNode()) {
6285 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6286 GL = GL->getOperand(0).getNode();
6287 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6288 SDValue(GL, 0)));
6289 }
6290
6291 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6292 };
6293
6294 SDValue Src0 = N->getOperand(1);
6295 SDValue Src1, Src2;
6296 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6297 IID == Intrinsic::amdgcn_mov_dpp8 ||
6298 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6299 Src1 = N->getOperand(2);
6300 if (IID == Intrinsic::amdgcn_writelane ||
6301 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6302 Src2 = N->getOperand(3);
6303 }
6304
6305 if (ValSize == SplitSize) {
6306 // Already legal
6307 return SDValue();
6308 }
6309
6310 if (ValSize < 32) {
6311 bool IsFloat = VT.isFloatingPoint();
6312 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6313 SL, MVT::i32);
6314
6315 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6316 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6317 SL, MVT::i32);
6318 }
6319
6320 if (IID == Intrinsic::amdgcn_writelane) {
6321 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6322 SL, MVT::i32);
6323 }
6324
6325 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6326 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6327 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6328 }
6329
6330 if (ValSize % SplitSize != 0)
6331 return SDValue();
6332
6333 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6334 EVT VT = N->getValueType(0);
6335 unsigned NE = VT.getVectorNumElements();
6336 EVT EltVT = VT.getVectorElementType();
6338 unsigned NumOperands = N->getNumOperands();
6339 SmallVector<SDValue, 4> Operands(NumOperands);
6340 SDNode *GL = N->getGluedNode();
6341
6342 // only handle convergencectrl_glue
6344
6345 for (unsigned i = 0; i != NE; ++i) {
6346 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6347 ++j) {
6348 SDValue Operand = N->getOperand(j);
6349 EVT OperandVT = Operand.getValueType();
6350 if (OperandVT.isVector()) {
6351 // A vector operand; extract a single element.
6352 EVT OperandEltVT = OperandVT.getVectorElementType();
6353 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6354 Operand, DAG.getVectorIdxConstant(i, SL));
6355 } else {
6356 // A scalar operand; just use it as is.
6357 Operands[j] = Operand;
6358 }
6359 }
6360
6361 if (GL)
6362 Operands[NumOperands - 1] =
6363 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6364 SDValue(GL->getOperand(0).getNode(), 0));
6365
6366 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6367 }
6368
6369 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6370 return DAG.getBuildVector(VecVT, SL, Scalars);
6371 };
6372
6373 if (VT.isVector()) {
6374 switch (MVT::SimpleValueType EltTy =
6376 case MVT::i32:
6377 case MVT::f32:
6378 if (SplitSize == 32) {
6379 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6380 return unrollLaneOp(LaneOp.getNode());
6381 }
6382 [[fallthrough]];
6383 case MVT::i16:
6384 case MVT::f16:
6385 case MVT::bf16: {
6386 unsigned SubVecNumElt =
6387 SplitSize / VT.getVectorElementType().getSizeInBits();
6388 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6390 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6391 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6392 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6393 DAG.getConstant(EltIdx, SL, MVT::i32));
6394
6395 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6396 IsPermLane16)
6397 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6398 DAG.getConstant(EltIdx, SL, MVT::i32));
6399
6400 if (IID == Intrinsic::amdgcn_writelane)
6401 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6402 DAG.getConstant(EltIdx, SL, MVT::i32));
6403
6404 Pieces.push_back(
6405 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6406 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6407 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6408 EltIdx += SubVecNumElt;
6409 }
6410 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6411 }
6412 default:
6413 // Handle all other cases by bitcasting to i32 vectors
6414 break;
6415 }
6416 }
6417
6418 MVT VecVT =
6419 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6420 Src0 = DAG.getBitcast(VecVT, Src0);
6421
6422 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6423 Src1 = DAG.getBitcast(VecVT, Src1);
6424
6425 if (IID == Intrinsic::amdgcn_writelane)
6426 Src2 = DAG.getBitcast(VecVT, Src2);
6427
6428 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6429 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6430 return DAG.getBitcast(VT, UnrolledLaneOp);
6431}
6432
6435 SelectionDAG &DAG) const {
6436 switch (N->getOpcode()) {
6438 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6439 Results.push_back(Res);
6440 return;
6441 }
6443 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6444 Results.push_back(Res);
6445 return;
6446 }
6448 unsigned IID = N->getConstantOperandVal(0);
6449 switch (IID) {
6450 case Intrinsic::amdgcn_make_buffer_rsrc:
6451 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6452 return;
6453 case Intrinsic::amdgcn_cvt_pkrtz: {
6454 SDValue Src0 = N->getOperand(1);
6455 SDValue Src1 = N->getOperand(2);
6456 SDLoc SL(N);
6457 SDValue Cvt =
6458 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6459 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6460 return;
6461 }
6462 case Intrinsic::amdgcn_cvt_pknorm_i16:
6463 case Intrinsic::amdgcn_cvt_pknorm_u16:
6464 case Intrinsic::amdgcn_cvt_pk_i16:
6465 case Intrinsic::amdgcn_cvt_pk_u16: {
6466 SDValue Src0 = N->getOperand(1);
6467 SDValue Src1 = N->getOperand(2);
6468 SDLoc SL(N);
6469 unsigned Opcode;
6470
6471 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6473 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6475 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6477 else
6479
6480 EVT VT = N->getValueType(0);
6481 if (isTypeLegal(VT))
6482 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6483 else {
6484 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6485 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6486 }
6487 return;
6488 }
6489 case Intrinsic::amdgcn_s_buffer_load: {
6490 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6491 // s_buffer_load_u8 for both the signed and unsigned load instructions. Next,
6492 // the DAG combiner tries to merge the s_buffer_load_u8 with a sext
6493 // instruction (performSignExtendInRegCombine()) and replaces s_buffer_load_u8
6494 // with s_buffer_load_i8.
6495 if (!Subtarget->hasScalarSubwordLoads())
6496 return;
6497 SDValue Op = SDValue(N, 0);
6498 SDValue Rsrc = Op.getOperand(1);
6499 SDValue Offset = Op.getOperand(2);
6500 SDValue CachePolicy = Op.getOperand(3);
6501 EVT VT = Op.getValueType();
6502 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6503 SDLoc DL(Op);
6505 const DataLayout &DataLayout = DAG.getDataLayout();
6506 Align Alignment =
6512 VT.getStoreSize(), Alignment);
6513 SDValue LoadVal;
6514 if (!Offset->isDivergent()) {
6515 SDValue Ops[] = {Rsrc, // source register
6516 Offset, CachePolicy};
6517 SDValue BufferLoad =
6519 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6520 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6521 } else {
6522 SDValue Ops[] = {
6523 DAG.getEntryNode(), // Chain
6524 Rsrc, // rsrc
6525 DAG.getConstant(0, DL, MVT::i32), // vindex
6526 {}, // voffset
6527 {}, // soffset
6528 {}, // offset
6529 CachePolicy, // cachepolicy
6530 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6531 };
6532 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6533 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6534 }
6535 Results.push_back(LoadVal);
6536 return;
6537 }
6538 }
6539 break;
6540 }
6542 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6543 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6544 // FIXME: Hacky
6545 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6546 Results.push_back(Res.getOperand(I));
6547 }
6548 } else {
6549 Results.push_back(Res);
6550 Results.push_back(Res.getValue(1));
6551 }
6552 return;
6553 }
6554
6555 break;
6556 }
6557 case ISD::SELECT: {
6558 SDLoc SL(N);
6559 EVT VT = N->getValueType(0);
6560 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6561 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6562 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6563
6564 EVT SelectVT = NewVT;
6565 if (NewVT.bitsLT(MVT::i32)) {
6566 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6567 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6568 SelectVT = MVT::i32;
6569 }
6570
6571 SDValue NewSelect =
6572 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6573
6574 if (NewVT != SelectVT)
6575 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6576 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6577 return;
6578 }
6579 case ISD::FNEG: {
6580 if (N->getValueType(0) != MVT::v2f16)
6581 break;
6582
6583 SDLoc SL(N);
6584 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6585
6586 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6587 DAG.getConstant(0x80008000, SL, MVT::i32));
6588 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6589 return;
6590 }
6591 case ISD::FABS: {
6592 if (N->getValueType(0) != MVT::v2f16)
6593 break;
6594
6595 SDLoc SL(N);
6596 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6597
6598 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6599 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6600 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6601 return;
6602 }
6603 case ISD::FSQRT: {
6604 if (N->getValueType(0) != MVT::f16)
6605 break;
6606 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6607 break;
6608 }
6609 default:
6611 break;
6612 }
6613}
6614
6615/// Helper function for LowerBRCOND
6616static SDNode *findUser(SDValue Value, unsigned Opcode) {
6617
6618 for (SDUse &U : Value->uses()) {
6619 if (U.get() != Value)
6620 continue;
6621
6622 if (U.getUser()->getOpcode() == Opcode)
6623 return U.getUser();
6624 }
6625 return nullptr;
6626}
6627
6628unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6629 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6630 switch (Intr->getConstantOperandVal(1)) {
6631 case Intrinsic::amdgcn_if:
6632 return AMDGPUISD::IF;
6633 case Intrinsic::amdgcn_else:
6634 return AMDGPUISD::ELSE;
6635 case Intrinsic::amdgcn_loop:
6636 return AMDGPUISD::LOOP;
6637 case Intrinsic::amdgcn_end_cf:
6638 llvm_unreachable("should not occur");
6639 default:
6640 return 0;
6641 }
6642 }
6643
6644 // break, if_break, else_break are all only used as inputs to loop, not
6645 // directly as branch conditions.
6646 return 0;
6647}
6648
6650 const Triple &TT = getTargetMachine().getTargetTriple();
6654}
6655
6657 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6658 return false;
6659
6660 // FIXME: Either avoid relying on address space here or change the default
6661 // address space for functions to avoid the explicit check.
6662 return (GV->getValueType()->isFunctionTy() ||
6665}
6666
6668 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6669}
6670
6672 if (!GV->hasExternalLinkage())
6673 return true;
6674
6675 const auto OS = getTargetMachine().getTargetTriple().getOS();
6676 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6677}
6678
6679/// This transforms the control flow intrinsics to get the branch destination as
6680/// the last parameter; it also switches the branch target with BR if the need arises.
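/// For example, a brcond whose condition is the result of llvm.amdgcn.if is
/// rewritten so that the intrinsic becomes an AMDGPUISD::IF node carrying the
/// branch target as its final operand.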
6681SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6682 SDLoc DL(BRCOND);
6683
6684 SDNode *Intr = BRCOND.getOperand(1).getNode();
6685 SDValue Target = BRCOND.getOperand(2);
6686 SDNode *BR = nullptr;
6687 SDNode *SetCC = nullptr;
6688
6689 if (Intr->getOpcode() == ISD::SETCC) {
6690 // As long as we negate the condition everything is fine
6691 SetCC = Intr;
6692 Intr = SetCC->getOperand(0).getNode();
6693
6694 } else {
6695 // Get the target from BR if we don't negate the condition
6696 BR = findUser(BRCOND, ISD::BR);
6697 assert(BR && "brcond missing unconditional branch user");
6698 Target = BR->getOperand(1);
6699 }
6700
6701 unsigned CFNode = isCFIntrinsic(Intr);
6702 if (CFNode == 0) {
6703 // This is a uniform branch so we don't need to legalize.
6704 return BRCOND;
6705 }
6706
6707 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6708 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6709
6710 assert(!SetCC ||
6711 (SetCC->getConstantOperandVal(1) == 1 &&
6712 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6713 ISD::SETNE));
6714
6715 // operands of the new intrinsic call
6717 if (HaveChain)
6718 Ops.push_back(BRCOND.getOperand(0));
6719
6720 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6721 Ops.push_back(Target);
6722
6723 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6724
6725 // build the new intrinsic call
6726 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6727
6728 if (!HaveChain) {
6729 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6730
6731 Result = DAG.getMergeValues(Ops, DL).getNode();
6732 }
6733
6734 if (BR) {
6735 // Give the branch instruction our target
6736 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6737 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6738 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6739 }
6740
6741 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6742
6743 // Copy the intrinsic results to registers
6744 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6746 if (!CopyToReg)
6747 continue;
6748
6749 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6750 SDValue(Result, i - 1), SDValue());
6751
6752 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6753 }
6754
6755 // Remove the old intrinsic from the chain
6756 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6757 Intr->getOperand(0));
6758
6759 return Chain;
6760}
6761
6762SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6763 MVT VT = Op.getSimpleValueType();
6764 SDLoc DL(Op);
6765 // Checking the depth
6766 if (Op.getConstantOperandVal(0) != 0)
6767 return DAG.getConstant(0, DL, VT);
6768
6771 // Check for kernel and shader functions
6772 if (Info->isEntryFunction())
6773 return DAG.getConstant(0, DL, VT);
6774
6775 MachineFrameInfo &MFI = MF.getFrameInfo();
6776 // There is a call to @llvm.returnaddress in this function
6777 MFI.setReturnAddressIsTaken(true);
6778
6780 // Get the return address reg and mark it as an implicit live-in
6781 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6782 getRegClassFor(VT, Op.getNode()->isDivergent()));
6783
6784 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6785}
6786
6787SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6788 const SDLoc &DL, EVT VT) const {
6789 return Op.getValueType().bitsLE(VT)
6790 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6791 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6792 DAG.getTargetConstant(0, DL, MVT::i32));
6793}
6794
6795SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6796 assert(Op.getValueType() == MVT::f16 &&
6797 "Do not know how to custom lower FP_ROUND for non-f16 type");
6798
6799 SDValue Src = Op.getOperand(0);
6800 EVT SrcVT = Src.getValueType();
6801 if (SrcVT != MVT::f64)
6802 return Op;
6803
6804 // TODO: Handle strictfp
6805 if (Op.getOpcode() != ISD::FP_ROUND)
6806 return Op;
6807
6808 SDLoc DL(Op);
6809
6810 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6811 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6812 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6813}
6814
6815SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6816 SelectionDAG &DAG) const {
6817 EVT VT = Op.getValueType();
6818 const MachineFunction &MF = DAG.getMachineFunction();
6820 bool IsIEEEMode = Info->getMode().IEEE;
6821
6822 // FIXME: Assert during selection that this is only selected for
6823 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6824 // mode functions, but this happens to be OK since it's only done in cases
6825 // where it is known there is no sNaN.
6826 if (IsIEEEMode)
6827 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6828
6829 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6830 VT == MVT::v16bf16)
6831 return splitBinaryVectorOp(Op, DAG);
6832 return Op;
6833}
6834
6835SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6836 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6837 EVT VT = Op.getValueType();
6838 assert(VT == MVT::f16);
6839
6840 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6841 EVT ExpVT = Exp.getValueType();
6842 if (ExpVT == MVT::i16)
6843 return Op;
6844
6845 SDLoc DL(Op);
6846
6847 // Correct the exponent type for f16 to i16.
6848 // Clamp the range of the exponent to the instruction's range.
6849
6850 // TODO: This should be a generic narrowing legalization, and can easily be
6851 // done for GlobalISel.
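// Clamping to the i16 range before truncating is safe here: any exponent
// outside [-32768, 32767] already over- or underflows the f16 result, so the
// saturated exponent gives the same answer.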
6852
6853 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6854 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6855
6856 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6857 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6858
6859 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6860
6861 if (IsStrict) {
6862 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6863 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6864 }
6865
6866 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6867}
6868
6870 switch (Op->getOpcode()) {
6871 case ISD::SRA:
6872 case ISD::SMIN:
6873 case ISD::SMAX:
6874 return ISD::SIGN_EXTEND;
6875 case ISD::SRL:
6876 case ISD::UMIN:
6877 case ISD::UMAX:
6878 return ISD::ZERO_EXTEND;
6879 case ISD::ADD:
6880 case ISD::SUB:
6881 case ISD::AND:
6882 case ISD::OR:
6883 case ISD::XOR:
6884 case ISD::SHL:
6885 case ISD::SELECT:
6886 case ISD::MUL:
6887 // operation result won't be influenced by garbage high bits.
6888 // TODO: are all of those cases correct, and are there more?
6889 return ISD::ANY_EXTEND;
6890 case ISD::SETCC: {
6891 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6893 }
6894 default:
6895 llvm_unreachable("unexpected opcode!");
6896 }
6897}
6898
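// Promote a uniform 16-bit operation to 32 bits so it can be selected as a
// SALU instruction: extend both operands (zext for shift amounts, otherwise
// the extension chosen by getExtOpcodeForPromotedOp), perform the operation
// in i32, and truncate the result back, except for setcc which already
// produces i1.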
6899SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6900 DAGCombinerInfo &DCI) const {
6901 const unsigned Opc = Op.getOpcode();
6902 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6903 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6904 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6905 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6906 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6907
6908 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6909 : Op->getOperand(0).getValueType();
6910 auto ExtTy = OpTy.changeElementType(MVT::i32);
6911
6912 if (DCI.isBeforeLegalizeOps() ||
6913 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6914 return SDValue();
6915
6916 auto &DAG = DCI.DAG;
6917
6918 SDLoc DL(Op);
6919 SDValue LHS;
6920 SDValue RHS;
6921 if (Opc == ISD::SELECT) {
6922 LHS = Op->getOperand(1);
6923 RHS = Op->getOperand(2);
6924 } else {
6925 LHS = Op->getOperand(0);
6926 RHS = Op->getOperand(1);
6927 }
6928
6929 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6930 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6931
6932 // Special case: for shifts, the RHS always needs a zext.
6933 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6934 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6935 else
6936 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6937
6938 // setcc always returns an i1/i1 vector, so there is no need to truncate after.
6939 if (Opc == ISD::SETCC) {
6940 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6941 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6942 }
6943
6944 // For other ops, we extend the operation's return type as well so we need to
6945 // truncate back to the original type.
6946 SDValue NewVal;
6947 if (Opc == ISD::SELECT)
6948 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6949 else
6950 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6951
6952 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6953}
6954
6955// Custom lowering for vector multiplications and s_mul_u64.
6956SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6957 EVT VT = Op.getValueType();
6958
6959 // Split vector operands.
6960 if (VT.isVector())
6961 return splitBinaryVectorOp(Op, DAG);
6962
6963 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6964
6965 // There are four ways to lower s_mul_u64:
6966 //
6967 // 1. If all the operands are uniform, then we lower it as it is.
6968 //
6969 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6970 // multiplications because there is not a vector equivalent of s_mul_u64.
6971 //
6972 // 3. If the cost model decides that it is more efficient to use vector
6973 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6974 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6975 //
6976 // 4. If the cost model decides to use vector registers and both of the
6977 // operands are zero-extended/sign-extended from 32-bits, then we split the
6978 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6979 // possible to check if the operands are zero-extended or sign-extended in
6980 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6981 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6982 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6983 // If the cost model decides that we have to use vector registers, then
6984 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6985 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6986 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6987 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6988 // SIInstrInfo.cpp .
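// For example, for case 4: if computeKnownBits proves that both operands
// have at least 32 leading zero bits, the node is rewritten below to
// S_MUL_U64_U32_PSEUDO; if both have at least 33 sign bits it becomes
// S_MUL_I64_I32_PSEUDO.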
6989
6990 if (Op->isDivergent())
6991 return SDValue();
6992
6993 SDValue Op0 = Op.getOperand(0);
6994 SDValue Op1 = Op.getOperand(1);
6995 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6996 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6997 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6998 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6999 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7000 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7001 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7002 SDLoc SL(Op);
7003 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7004 return SDValue(
7005 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7006 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7007 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7008 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7009 return SDValue(
7010 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7011 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7012 return Op;
7013}
7014
7015SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7016 EVT VT = Op.getValueType();
7017 SDLoc SL(Op);
7018 SDValue LHS = Op.getOperand(0);
7019 SDValue RHS = Op.getOperand(1);
7020 bool isSigned = Op.getOpcode() == ISD::SMULO;
7021
7022 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7023 const APInt &C = RHSC->getAPIntValue();
7024 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7025 if (C.isPowerOf2()) {
7026 // smulo(x, signed_min) is the same as umulo(x, signed_min).
7027 bool UseArithShift = isSigned && !C.isMinSignedValue();
7028 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7029 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7030 SDValue Overflow =
7031 DAG.getSetCC(SL, MVT::i1,
7032 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7033 Result, ShiftAmt),
7034 LHS, ISD::SETNE);
7035 return DAG.getMergeValues({Result, Overflow}, SL);
7036 }
7037 }
7038
7039 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7040 SDValue Top =
7041 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7042
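// Overflow occurred iff the high half of the full product differs from what a
// non-overflowing multiply would produce: zero for unsigned, or the
// sign-extension of the low half (its sign bit broadcast) for signed.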
7043 SDValue Sign = isSigned
7044 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7045 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7046 SL, MVT::i32))
7047 : DAG.getConstant(0, SL, VT);
7048 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7049
7050 return DAG.getMergeValues({Result, Overflow}, SL);
7051}
7052
7053SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7054 if (Op->isDivergent()) {
7055 // Select to V_MAD_[IU]64_[IU]32.
7056 return Op;
7057 }
7058 if (Subtarget->hasSMulHi()) {
7059 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7060 return SDValue();
7061 }
7062 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7063 // calculate the high part, so we might as well do the whole thing with
7064 // V_MAD_[IU]64_[IU]32.
7065 return Op;
7066}
7067
7068SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7069 if (!Subtarget->isTrapHandlerEnabled() ||
7070 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7071 return lowerTrapEndpgm(Op, DAG);
7072
7073 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7074 : lowerTrapHsaQueuePtr(Op, DAG);
7075}
7076
7077SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7078 SDLoc SL(Op);
7079 SDValue Chain = Op.getOperand(0);
7080 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7081}
7082
7083SDValue
7084SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7085 const SDLoc &DL, Align Alignment,
7086 ImplicitParameter Param) const {
7087 MachineFunction &MF = DAG.getMachineFunction();
7088 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7089 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7090 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7091 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7092 MachineMemOperand::MODereferenceable |
7093 MachineMemOperand::MOInvariant);
7094 }
7095
7096SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7097 SelectionDAG &DAG) const {
7098 SDLoc SL(Op);
7099 SDValue Chain = Op.getOperand(0);
7100
7101 SDValue QueuePtr;
7102 // For code object version 5, QueuePtr is passed through implicit kernarg.
7103 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7104 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7105 QueuePtr =
7106 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7107 } else {
7108 MachineFunction &MF = DAG.getMachineFunction();
7109 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7110 Register UserSGPR = Info->getQueuePtrUserSGPR();
7111
7112 if (UserSGPR == AMDGPU::NoRegister) {
7113 // We probably are in a function incorrectly marked with
7114 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7115 // trap, so just use a null pointer.
7116 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7117 } else {
7118 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7119 MVT::i64);
7120 }
7121 }
7122
7123 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7124 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7125
7126 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7127 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7128 ToReg.getValue(1)};
7129 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7130}
7131
7132SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7133 SDLoc SL(Op);
7134 SDValue Chain = Op.getOperand(0);
7135
7136 // We need to simulate the 's_trap 2' instruction on targets that run in
7137 // PRIV=1 (where it is treated as a nop).
7138 if (Subtarget->hasPrivEnabledTrap2NopBug())
7139 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7140
7141 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7142 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7143 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7144}
7145
7146SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7147 SDLoc SL(Op);
7148 SDValue Chain = Op.getOperand(0);
7149 MachineFunction &MF = DAG.getMachineFunction();
7150
7151 if (!Subtarget->isTrapHandlerEnabled() ||
7152 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7153 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
7154 "debugtrap handler not supported",
7155 Op.getDebugLoc(), DS_Warning);
7156 LLVMContext &Ctx = MF.getFunction().getContext();
7157 Ctx.diagnose(NoTrap);
7158 return Chain;
7159 }
7160
7161 uint64_t TrapID =
7162 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7163 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7164 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7165}
7166
7167SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7168 SelectionDAG &DAG) const {
7169 if (Subtarget->hasApertureRegs()) {
7170 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7171 ? AMDGPU::SRC_SHARED_BASE
7172 : AMDGPU::SRC_PRIVATE_BASE;
7173 // Note: this feature (register) is broken. When used as a 32-bit operand,
7174 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7175 // bits.
7176 //
7177 // To work around the issue, directly emit a 64 bit mov from this register
7178 // then extract the high bits. Note that this shouldn't even result in a
7179 // shift being emitted and simply become a pair of registers (e.g.):
7180 // s_mov_b64 s[6:7], src_shared_base
7181 // v_mov_b32_e32 v1, s7
7182 //
7183 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7184 // coalescing would kick in and it would think it's okay to use the "HI"
7185 // subregister directly (instead of extracting the HI 32 bits) which is an
7186 // artificial (unusable) register.
7187 // Register TableGen definitions would need an overhaul to get rid of the
7188 // artificial "HI" aperture registers and prevent this kind of issue from
7189 // happening.
7190 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7191 DAG.getRegister(ApertureRegNo, MVT::i64));
7192 return DAG.getNode(
7193 ISD::TRUNCATE, DL, MVT::i32,
7194 DAG.getNode(ISD::SRL, DL, MVT::i64,
7195 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7196 }
7197
7198 // For code object version 5, private_base and shared_base are passed through
7199 // implicit kernargs.
7200 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7201 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7202 ImplicitParameter Param =
7203 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7204 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7205 }
7206
7207 MachineFunction &MF = DAG.getMachineFunction();
7208 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7209 Register UserSGPR = Info->getQueuePtrUserSGPR();
7210 if (UserSGPR == AMDGPU::NoRegister) {
7211 // We probably are in a function incorrectly marked with
7212 // amdgpu-no-queue-ptr. This is undefined.
7213 return DAG.getUNDEF(MVT::i32);
7214 }
7215
7216 SDValue QueuePtr =
7217 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7218
7219 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7220 // private_segment_aperture_base_hi.
7221 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7222
7223 SDValue Ptr =
7224 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7225
7226 // TODO: Use custom target PseudoSourceValue.
7227 // TODO: We should use the value from the IR intrinsic call, but it might not
7228 // be available and how do we get it?
7229 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7230 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7231 commonAlignment(Align(64), StructOffset),
7232 MachineMemOperand::MODereferenceable |
7233 MachineMemOperand::MOInvariant);
7234 }
7235
7236/// Return true if the value is a known valid address, such that a null check is
7237/// not necessary.
7238 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7239 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7240 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7241 isa<BasicBlockSDNode>(Val))
7242 return true;
7243
7244 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7245 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7246
7247 // TODO: Search through arithmetic, handle arguments and loads
7248 // marked nonnull.
7249 return false;
7250}
7251
7252SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7253 SelectionDAG &DAG) const {
7254 SDLoc SL(Op);
7255
7256 const AMDGPUTargetMachine &TM =
7257 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7258
7259 unsigned DestAS, SrcAS;
7260 SDValue Src;
7261 bool IsNonNull = false;
7262 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7263 SrcAS = ASC->getSrcAddressSpace();
7264 Src = ASC->getOperand(0);
7265 DestAS = ASC->getDestAddressSpace();
7266 } else {
7267 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7268 Op.getConstantOperandVal(0) ==
7269 Intrinsic::amdgcn_addrspacecast_nonnull);
7270 Src = Op->getOperand(1);
7271 SrcAS = Op->getConstantOperandVal(2);
7272 DestAS = Op->getConstantOperandVal(3);
7273 IsNonNull = true;
7274 }
7275
7276 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7277
7278 // flat -> local/private
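// The segment offset lives in the low 32 bits of a flat pointer, so the cast
// itself is a truncation; a flat null pointer must instead map to the segment
// null value, hence the select below when non-nullness is not known.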
7279 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7280 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7281 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7282 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7283
7284 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7285 return Ptr;
7286
7287 unsigned NullVal = TM.getNullPointerValue(DestAS);
7288 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7289 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7290
7291 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7292 SegmentNullPtr);
7293 }
7294 }
7295
7296 // local/private -> flat
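// A 64-bit flat pointer is built by pairing the 32-bit segment offset (low
// half) with the segment aperture base (high half); the segment null value is
// likewise mapped to the flat null pointer.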
7297 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7298 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7299 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7300
7301 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7302 SDValue CvtPtr =
7303 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7304 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7305
7306 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7307 return CvtPtr;
7308
7309 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7310 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7311
7312 SDValue NonNull =
7313 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7314
7315 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7316 FlatNullPtr);
7317 }
7318 }
7319
7320 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7321 Op.getValueType() == MVT::i64) {
7322 const SIMachineFunctionInfo *Info =
7323 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7324 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7325 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7326 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7327 }
7328
7329 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7330 Src.getValueType() == MVT::i64)
7331 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7332
7333 // global <-> flat are no-ops and never emitted.
7334
7335 const MachineFunction &MF = DAG.getMachineFunction();
7336 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7337 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7338 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7339
7340 return DAG.getUNDEF(Op->getValueType(0));
7341}
7342
7343// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7344// the small vector and inserting them into the big vector. That is better than
7345// the default expansion of doing it via a stack slot. Even though the use of
7346// the stack slot would be optimized away afterwards, the stack slot itself
7347// remains.
7348SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7349 SelectionDAG &DAG) const {
7350 SDValue Vec = Op.getOperand(0);
7351 SDValue Ins = Op.getOperand(1);
7352 SDValue Idx = Op.getOperand(2);
7353 EVT VecVT = Vec.getValueType();
7354 EVT InsVT = Ins.getValueType();
7355 EVT EltVT = VecVT.getVectorElementType();
7356 unsigned InsNumElts = InsVT.getVectorNumElements();
7357 unsigned IdxVal = Idx->getAsZExtVal();
7358 SDLoc SL(Op);
7359
7360 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7361 // Insert 32-bit registers at a time.
7362 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7363
7364 unsigned VecNumElts = VecVT.getVectorNumElements();
7365 EVT NewVecVT =
7366 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7367 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7368 : EVT::getVectorVT(*DAG.getContext(),
7369 MVT::i32, InsNumElts / 2);
7370
7371 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7372 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7373
7374 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7375 SDValue Elt;
7376 if (InsNumElts == 2) {
7377 Elt = Ins;
7378 } else {
7379 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7380 DAG.getConstant(I, SL, MVT::i32));
7381 }
7382 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7383 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7384 }
7385
7386 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7387 }
7388
7389 for (unsigned I = 0; I != InsNumElts; ++I) {
7390 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7391 DAG.getConstant(I, SL, MVT::i32));
7392 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7393 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7394 }
7395 return Vec;
7396}
7397
7398SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7399 SelectionDAG &DAG) const {
7400 SDValue Vec = Op.getOperand(0);
7401 SDValue InsVal = Op.getOperand(1);
7402 SDValue Idx = Op.getOperand(2);
7403 EVT VecVT = Vec.getValueType();
7404 EVT EltVT = VecVT.getVectorElementType();
7405 unsigned VecSize = VecVT.getSizeInBits();
7406 unsigned EltSize = EltVT.getSizeInBits();
7407 SDLoc SL(Op);
7408
7409 // Specially handle the case of v4i16 with static indexing.
7410 unsigned NumElts = VecVT.getVectorNumElements();
7411 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7412 if (NumElts == 4 && EltSize == 16 && KIdx) {
7413 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7414
7415 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7416 DAG.getConstant(0, SL, MVT::i32));
7417 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7418 DAG.getConstant(1, SL, MVT::i32));
7419
7420 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7421 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7422
7423 unsigned Idx = KIdx->getZExtValue();
7424 bool InsertLo = Idx < 2;
7425 SDValue InsHalf = DAG.getNode(
7426 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7427 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7428 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7429
7430 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7431
7432 SDValue Concat =
7433 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7434 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7435
7436 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7437 }
7438
7439 // Static indexing does not lower to stack access, and hence there is no need
7440 // for special custom lowering to avoid stack access.
7441 if (isa<ConstantSDNode>(Idx))
7442 return SDValue();
7443
7444 // Avoid stack access for dynamic indexing by custom lowering to
7445 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
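// (v_bfm_b32 w, s produces a mask of w set bits shifted left by s bits;
// v_bfi_b32 m, x, y computes (x & m) | (y & ~m), i.e. a bitfield insert.)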
7446
7447 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7448
7449 MVT IntVT = MVT::getIntegerVT(VecSize);
7450
7451 // Convert vector index to bit-index and get the required bit mask.
7452 assert(isPowerOf2_32(EltSize));
7453 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7454 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7455 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7456 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7457 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7458
7459 // 1. Create a congruent vector with the target value in each element.
7460 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7461 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7462
7463 // 2. Mask off all other indices except the required index within (1).
7464 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7465
7466 // 3. Mask off the required index within the target vector.
7467 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7468 SDValue RHS =
7469 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7470
7471 // 4. Get (2) and (3) ORed into the target vector.
7472 SDValue BFI =
7473 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7474
7475 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7476}
7477
7478SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7479 SelectionDAG &DAG) const {
7480 SDLoc SL(Op);
7481
7482 EVT ResultVT = Op.getValueType();
7483 SDValue Vec = Op.getOperand(0);
7484 SDValue Idx = Op.getOperand(1);
7485 EVT VecVT = Vec.getValueType();
7486 unsigned VecSize = VecVT.getSizeInBits();
7487 EVT EltVT = VecVT.getVectorElementType();
7488
7489 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7490
7491 // Make sure we do any optimizations that will make it easier to fold
7492 // source modifiers before obscuring it with bit operations.
7493
7494 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7495 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7496 return Combined;
7497
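// For wide vectors, split the vector into two halves, select the half that
// contains the requested element, and redo the extract with the index masked
// down to the half-vector range.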
7498 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7499 SDValue Lo, Hi;
7500 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7501
7502 if (VecSize == 128) {
7503 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7504 Lo = DAG.getBitcast(LoVT,
7505 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7506 DAG.getConstant(0, SL, MVT::i32)));
7507 Hi = DAG.getBitcast(HiVT,
7508 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7509 DAG.getConstant(1, SL, MVT::i32)));
7510 } else if (VecSize == 256) {
7511 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7512 SDValue Parts[4];
7513 for (unsigned P = 0; P < 4; ++P) {
7514 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7515 DAG.getConstant(P, SL, MVT::i32));
7516 }
7517
7518 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7519 Parts[0], Parts[1]));
7520 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7521 Parts[2], Parts[3]));
7522 } else {
7523 assert(VecSize == 512);
7524
7525 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7526 SDValue Parts[8];
7527 for (unsigned P = 0; P < 8; ++P) {
7528 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7529 DAG.getConstant(P, SL, MVT::i32));
7530 }
7531
7532 Lo = DAG.getBitcast(LoVT,
7533 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7534 Parts[0], Parts[1], Parts[2], Parts[3]));
7535 Hi = DAG.getBitcast(HiVT,
7536 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7537 Parts[4], Parts[5], Parts[6], Parts[7]));
7538 }
7539
7540 EVT IdxVT = Idx.getValueType();
7541 unsigned NElem = VecVT.getVectorNumElements();
7542 assert(isPowerOf2_32(NElem));
7543 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7544 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7545 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7546 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7547 }
7548
7549 assert(VecSize <= 64);
7550
7551 MVT IntVT = MVT::getIntegerVT(VecSize);
7552
7553 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7554 SDValue VecBC = peekThroughBitcasts(Vec);
7555 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7556 SDValue Src = VecBC.getOperand(0);
7557 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7558 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7559 }
7560
7561 unsigned EltSize = EltVT.getSizeInBits();
7562 assert(isPowerOf2_32(EltSize));
7563
7564 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7565
7566 // Convert vector index to bit-index (* EltSize)
7567 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7568
7569 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7570 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7571
7572 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7573 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7574 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7575 }
7576
7577 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7578}
7579
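// True if mask elements Elt and Elt+1 select two consecutive source elements
// starting at an even index, so the pair can be copied as one packed
// two-element subvector instead of being scalarized.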
7580static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7581 assert(Elt % 2 == 0);
7582 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7583}
7584
7585SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7586 SelectionDAG &DAG) const {
7587 SDLoc SL(Op);
7588 EVT ResultVT = Op.getValueType();
7589 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7590 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7591 MVT PackVT = MVT::getVectorVT(EltVT, 2);
7592 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7593
7594 // vector_shuffle <0,1,6,7> lhs, rhs
7595 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7596 //
7597 // vector_shuffle <6,7,2,3> lhs, rhs
7598 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7599 //
7600 // vector_shuffle <6,7,0,1> lhs, rhs
7601 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7602
7603 // Avoid scalarizing when both halves are reading from consecutive elements.
7605 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7606 if (elementPairIsContiguous(SVN->getMask(), I)) {
7607 const int Idx = SVN->getMaskElt(I);
7608 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7609 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7610 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7611 SVN->getOperand(VecIdx),
7612 DAG.getConstant(EltIdx, SL, MVT::i32));
7613 Pieces.push_back(SubVec);
7614 } else {
7615 const int Idx0 = SVN->getMaskElt(I);
7616 const int Idx1 = SVN->getMaskElt(I + 1);
7617 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7618 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7619 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7620 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7621
7622 SDValue Vec0 = SVN->getOperand(VecIdx0);
7623 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7624 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7625
7626 SDValue Vec1 = SVN->getOperand(VecIdx1);
7627 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7628 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7629 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7630 }
7631 }
7632
7633 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7634}
7635
7636SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7637 SelectionDAG &DAG) const {
7638 SDValue SVal = Op.getOperand(0);
7639 EVT ResultVT = Op.getValueType();
7640 EVT SValVT = SVal.getValueType();
7641 SDValue UndefVal = DAG.getUNDEF(SValVT);
7642 SDLoc SL(Op);
7643
7645 VElts.push_back(SVal);
7646 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7647 VElts.push_back(UndefVal);
7648
7649 return DAG.getBuildVector(ResultVT, SL, VElts);
7650}
7651
7652SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7653 SelectionDAG &DAG) const {
7654 SDLoc SL(Op);
7655 EVT VT = Op.getValueType();
7656
7657 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7658 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7659
7660 SDValue Lo = Op.getOperand(0);
7661 SDValue Hi = Op.getOperand(1);
7662
7663 // Avoid adding defined bits with the zero_extend.
7664 if (Hi.isUndef()) {
7665 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7666 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7667 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7668 }
7669
7670 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7671 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7672
7673 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7674 DAG.getConstant(16, SL, MVT::i32));
7675 if (Lo.isUndef())
7676 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7677
7678 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7679 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7680
7681 SDValue Or =
7682 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7683 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7684 }
7685
7686 // Split into 2-element chunks.
7687 const unsigned NumParts = VT.getVectorNumElements() / 2;
7689 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7690
7692 for (unsigned P = 0; P < NumParts; ++P) {
7693 SDValue Vec = DAG.getBuildVector(
7694 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7695 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7696 }
7697
7698 SDValue Blend =
7699 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7700 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7701}
7702
7703 bool SITargetLowering::isOffsetFoldingLegal(
7704 const GlobalAddressSDNode *GA) const {
7705 // OSes that use ELF REL relocations (instead of RELA) can only store a
7706 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7707 // which can create arbitrary 64-bit addends. (This is only a problem for
7708 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7709 // the high 32 bits of the addend.)
7710 //
7711 // This should be kept in sync with how HasRelocationAddend is initialized in
7712 // the constructor of ELFAMDGPUAsmBackend.
7713 if (!Subtarget->isAmdHsaOS())
7714 return false;
7715
7716 // We can fold offsets for anything that doesn't require a GOT relocation.
7717 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7718 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7719 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7720 !shouldEmitGOTReloc(GA->getGlobal());
7721 }
7722
7723static SDValue
7724 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7725 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7726 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7727 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7728 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7729 // lowered to the following code sequence:
7730 //
7731 // For constant address space:
7732 // s_getpc_b64 s[0:1]
7733 // s_add_u32 s0, s0, $symbol
7734 // s_addc_u32 s1, s1, 0
7735 //
7736 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7737 // a fixup or relocation is emitted to replace $symbol with a literal
7738 // constant, which is a pc-relative offset from the encoding of the $symbol
7739 // operand to the global variable.
7740 //
7741 // For global address space:
7742 // s_getpc_b64 s[0:1]
7743 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7744 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7745 //
7746 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7747 // fixups or relocations are emitted to replace $symbol@*@lo and
7748 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7749 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7750 // operand to the global variable.
7751 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7752 SDValue PtrHi;
7753 if (GAFlags == SIInstrInfo::MO_NONE)
7754 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7755 else
7756 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7757 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7758}
7759
7760SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7761 SDValue Op,
7762 SelectionDAG &DAG) const {
7763 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7764 SDLoc DL(GSD);
7765 EVT PtrVT = Op.getValueType();
7766
7767 const GlobalValue *GV = GSD->getGlobal();
7773 GV->hasExternalLinkage()) {
7774 Type *Ty = GV->getValueType();
7775 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7776 // zero-sized type in other languages to declare dynamic shared
7777 // memory whose size is not known at compile time. Such arrays are
7778 // allocated by the runtime and placed directly after the statically
7779 // allocated ones, so they all share the same offset.
7780 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7781 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7782 // Adjust alignment for that dynamic shared memory array.
7784 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7785 MFI->setUsesDynamicLDS(true);
7786 return SDValue(
7787 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7788 }
7789 }
7791 }
7792
7794 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7796 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7797 }
7798
7799 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7800 SDValue AddrLo = DAG.getTargetGlobalAddress(
7801 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7802 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7803
7804 SDValue AddrHi = DAG.getTargetGlobalAddress(
7805 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7806 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7807
7808 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7809 }
7810
7811 if (shouldEmitFixup(GV))
7812 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7813
7814 if (shouldEmitPCReloc(GV))
7815 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7816 SIInstrInfo::MO_REL32);
7817
7818 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7819 SIInstrInfo::MO_GOTPCREL32);
7820
7821 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7823 const DataLayout &DataLayout = DAG.getDataLayout();
7824 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7825 MachinePointerInfo PtrInfo =
7827
7828 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7831}
7832
7834 const SDLoc &DL, SDValue V) const {
7835 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7836 // the destination register.
7837 //
7838 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7839 // so we will end up with redundant moves to m0.
7840 //
7841 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7842
7843 // A Null SDValue creates a glue result.
7844 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7845 V, Chain);
7846 return SDValue(M0, 0);
7847}
7848
7849SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7850 MVT VT,
7851 unsigned Offset) const {
7852 SDLoc SL(Op);
7853 SDValue Param = lowerKernargMemParameter(
7854 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7855 // The local size values will have the hi 16-bits as zero.
7856 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7857 DAG.getValueType(VT));
7858}
7859
7860 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7861 EVT VT) {
7862 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7863 "non-hsa intrinsic with hsa target",
7864 DL.getDebugLoc());
7865 DAG.getContext()->diagnose(BadIntrin);
7866 return DAG.getUNDEF(VT);
7867}
7868
7869 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7870 EVT VT) {
7871 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7872 "intrinsic not supported on subtarget",
7873 DL.getDebugLoc());
7874 DAG.getContext()->diagnose(BadIntrin);
7875 return DAG.getUNDEF(VT);
7876}
7877
7878 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
7879 ArrayRef<SDValue> Elts) {
7880 assert(!Elts.empty());
7881 MVT Type;
7882 unsigned NumElts = Elts.size();
7883
7884 if (NumElts <= 12) {
7885 Type = MVT::getVectorVT(MVT::f32, NumElts);
7886 } else {
7887 assert(Elts.size() <= 16);
7888 Type = MVT::v16f32;
7889 NumElts = 16;
7890 }
7891
7892 SmallVector<SDValue, 16> VecElts(NumElts);
7893 for (unsigned i = 0; i < Elts.size(); ++i) {
7894 SDValue Elt = Elts[i];
7895 if (Elt.getValueType() != MVT::f32)
7896 Elt = DAG.getBitcast(MVT::f32, Elt);
7897 VecElts[i] = Elt;
7898 }
7899 for (unsigned i = Elts.size(); i < NumElts; ++i)
7900 VecElts[i] = DAG.getUNDEF(MVT::f32);
7901
7902 if (NumElts == 1)
7903 return VecElts[0];
7904 return DAG.getBuildVector(Type, DL, VecElts);
7905}
7906
7907static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7908 SDValue Src, int ExtraElts) {
7909 EVT SrcVT = Src.getValueType();
7910
7912
7913 if (SrcVT.isVector())
7914 DAG.ExtractVectorElements(Src, Elts);
7915 else
7916 Elts.push_back(Src);
7917
7918 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7919 while (ExtraElts--)
7920 Elts.push_back(Undef);
7921
7922 return DAG.getBuildVector(CastVT, DL, Elts);
7923}
7924
7925 // Reconstruct the required return value for an image load intrinsic.
7926 // This is more complicated due to the optional use of TexFailCtrl, which means
7927 // the required return type is an aggregate.
7928 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7929 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7930 bool Unpacked, bool IsD16, int DMaskPop,
7931 int NumVDataDwords, bool IsAtomicPacked16Bit,
7932 const SDLoc &DL) {
7933 // Determine the required return type. This is the same regardless of
7934 // IsTexFail flag
7935 EVT ReqRetVT = ResultTypes[0];
7936 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
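// With packed D16 results (or packed 16-bit atomics), two elements share each
// dword, so halve the element counts, rounding up.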
7937 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7938 ? (ReqRetNumElts + 1) / 2
7939 : ReqRetNumElts;
7940
7941 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7942
7943 MVT DataDwordVT =
7944 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7945
7946 MVT MaskPopVT =
7947 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7948
7949 SDValue Data(Result, 0);
7950 SDValue TexFail;
7951
7952 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7953 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7954 if (MaskPopVT.isVector()) {
7955 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7956 SDValue(Result, 0), ZeroIdx);
7957 } else {
7958 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7959 SDValue(Result, 0), ZeroIdx);
7960 }
7961 }
7962
7963 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7964 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7965 NumDataDwords - MaskPopDwords);
7966
7967 if (IsD16)
7968 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7969
7970 EVT LegalReqRetVT = ReqRetVT;
7971 if (!ReqRetVT.isVector()) {
7972 if (!Data.getValueType().isInteger())
7973 Data = DAG.getNode(ISD::BITCAST, DL,
7974 Data.getValueType().changeTypeToInteger(), Data);
7975 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7976 } else {
7977 // We need to widen the return vector to a legal type
7978 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7979 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7980 LegalReqRetVT =
7982 ReqRetVT.getVectorNumElements() + 1);
7983 }
7984 }
7985 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7986
7987 if (IsTexFail) {
7988 TexFail =
7989 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7990 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7991
7992 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7993 }
7994
7995 if (Result->getNumValues() == 1)
7996 return Data;
7997
7998 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7999}
8000
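// Decode the TexFailCtrl immediate: bit 0 enables TFE and bit 1 enables LWE.
// Returns false if any other bits are set, i.e. the control value is invalid.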
8001static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8002 SDValue *LWE, bool &IsTexFail) {
8003 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8004
8005 uint64_t Value = TexFailCtrlConst->getZExtValue();
8006 if (Value) {
8007 IsTexFail = true;
8008 }
8009
8010 SDLoc DL(TexFailCtrlConst);
8011 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8012 Value &= ~(uint64_t)0x1;
8013 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8014 Value &= ~(uint64_t)0x2;
8015
8016 return Value == 0;
8017}
8018
8019 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8020 MVT PackVectorVT,
8021 SmallVectorImpl<SDValue> &PackedAddrs,
8022 unsigned DimIdx, unsigned EndIdx,
8023 unsigned NumGradients) {
8024 SDLoc DL(Op);
8025 for (unsigned I = DimIdx; I < EndIdx; I++) {
8026 SDValue Addr = Op.getOperand(I);
8027
8028 // Gradients are packed with undef for each coordinate.
8029 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8030 // 1D: undef,dx/dh; undef,dx/dv
8031 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8032 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8033 if (((I + 1) >= EndIdx) ||
8034 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8035 I == DimIdx + NumGradients - 1))) {
8036 if (Addr.getValueType() != MVT::i16)
8037 Addr = DAG.getBitcast(MVT::i16, Addr);
8038 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8039 } else {
8040 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8041 I++;
8042 }
8043 Addr = DAG.getBitcast(MVT::f32, Addr);
8044 PackedAddrs.push_back(Addr);
8045 }
8046}
8047
8048SDValue SITargetLowering::lowerImage(SDValue Op,
8049 const AMDGPU::ImageDimIntrinsicInfo *Intr,
8050 SelectionDAG &DAG, bool WithChain) const {
8051 SDLoc DL(Op);
8052 MachineFunction &MF = DAG.getMachineFunction();
8053 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8054 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8055 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8056 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8057 unsigned IntrOpcode = Intr->BaseOpcode;
8058 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8059 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8060 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8061
8062 SmallVector<EVT, 3> ResultTypes(Op->values());
8063 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8064 bool IsD16 = false;
8065 bool IsG16 = false;
8066 bool IsA16 = false;
8067 SDValue VData;
8068 int NumVDataDwords = 0;
8069 bool AdjustRetType = false;
8070 bool IsAtomicPacked16Bit = false;
8071
8072 // Offset of intrinsic arguments
8073 const unsigned ArgOffset = WithChain ? 2 : 1;
8074
8075 unsigned DMask;
8076 unsigned DMaskLanes = 0;
8077
8078 if (BaseOpcode->Atomic) {
8079 VData = Op.getOperand(2);
8080
8081 IsAtomicPacked16Bit =
8082 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8083 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8084
8085 bool Is64Bit = VData.getValueSizeInBits() == 64;
8086 if (BaseOpcode->AtomicX2) {
8087 SDValue VData2 = Op.getOperand(3);
8088 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8089 {VData, VData2});
8090 if (Is64Bit)
8091 VData = DAG.getBitcast(MVT::v4i32, VData);
8092
8093 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8094 DMask = Is64Bit ? 0xf : 0x3;
8095 NumVDataDwords = Is64Bit ? 4 : 2;
8096 } else {
8097 DMask = Is64Bit ? 0x3 : 0x1;
8098 NumVDataDwords = Is64Bit ? 2 : 1;
8099 }
8100 } else {
8101 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8102 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8103
8104 if (BaseOpcode->Store) {
8105 VData = Op.getOperand(2);
8106
8107 MVT StoreVT = VData.getSimpleValueType();
8108 if (StoreVT.getScalarType() == MVT::f16) {
8109 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8110 return Op; // D16 is unsupported for this instruction
8111
8112 IsD16 = true;
8113 VData = handleD16VData(VData, DAG, true);
8114 }
8115
8116 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8117 } else if (!BaseOpcode->NoReturn) {
8118 // Work out the num dwords based on the dmask popcount and underlying type
8119 // and whether packing is supported.
8120 MVT LoadVT = ResultTypes[0].getSimpleVT();
8121 if (LoadVT.getScalarType() == MVT::f16) {
8122 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8123 return Op; // D16 is unsupported for this instruction
8124
8125 IsD16 = true;
8126 }
8127
8128 // Confirm that the return type is large enough for the dmask specified
8129 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8130 (!LoadVT.isVector() && DMaskLanes > 1))
8131 return Op;
8132
8133 // The sq block of gfx8 and gfx9 do not estimate register use correctly
8134 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8135 // instructions.
8136 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8137 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8138 NumVDataDwords = (DMaskLanes + 1) / 2;
8139 else
8140 NumVDataDwords = DMaskLanes;
8141
8142 AdjustRetType = true;
8143 }
8144 }
8145
8146 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8148
8149 // Check for 16 bit addresses or derivatives and pack if true.
8150 MVT VAddrVT =
8151 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8152 MVT VAddrScalarVT = VAddrVT.getScalarType();
8153 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8154 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8155
8156 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8157 VAddrScalarVT = VAddrVT.getScalarType();
8158 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8159 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8160
8161 // Push back extra arguments.
8162 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8163 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8164 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8165 // Special handling of bias when A16 is on. Bias is of type half but
8166 // occupies a full 32 bits.
8167 SDValue Bias = DAG.getBuildVector(
8168 MVT::v2f16, DL,
8169 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8170 VAddrs.push_back(Bias);
8171 } else {
8172 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8173 "Bias needs to be converted to 16 bit in A16 mode");
8174 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8175 }
8176 }
8177
8178 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8179 // 16 bit gradients are supported, but are tied to the A16 control
8180 // so both gradients and addresses must be 16 bit
8181 LLVM_DEBUG(
8182 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8183 "require 16 bit args for both gradients and addresses");
8184 return Op;
8185 }
8186
8187 if (IsA16) {
8188 if (!ST->hasA16()) {
8189 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8190 "support 16 bit addresses\n");
8191 return Op;
8192 }
8193 }
8194
8195 // We've dealt with incorrect input so we know that if IsA16, IsG16
8196 // are set then we have to compress/pack operands (either address,
8197 // gradient or both)
8198 // In the case where a16 and gradients are tied (no G16 support) then we
8199 // have already verified that both IsA16 and IsG16 are true
8200 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8201 // Activate g16
8202 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8204 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8205 }
8206
8207 // Add gradients (packed or unpacked)
8208 if (IsG16) {
8209 // Pack the gradients
8210 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8211 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8212 ArgOffset + Intr->GradientStart,
8213 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8214 } else {
8215 for (unsigned I = ArgOffset + Intr->GradientStart;
8216 I < ArgOffset + Intr->CoordStart; I++)
8217 VAddrs.push_back(Op.getOperand(I));
8218 }
8219
8220 // Add addresses (packed or unpacked)
8221 if (IsA16) {
8222 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8223 ArgOffset + Intr->CoordStart, VAddrEnd,
8224 0 /* No gradients */);
8225 } else {
8226 // Add uncompressed address
8227 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8228 VAddrs.push_back(Op.getOperand(I));
8229 }
8230
8231 // If the register allocator cannot place the address registers contiguously
8232 // without introducing moves, then using the non-sequential address encoding
8233 // is always preferable, since it saves VALU instructions and is usually a
8234 // wash in terms of code size or even better.
8235 //
8236 // However, we currently have no way of hinting to the register allocator that
8237 // MIMG addresses should be placed contiguously when it is possible to do so,
8238 // so force non-NSA for the common 2-address case as a heuristic.
8239 //
8240 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8241 // allocation when possible.
8242 //
8243 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8244 // set of the remaining addresses.
8245 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8246 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8247 const bool UseNSA = ST->hasNSAEncoding() &&
8248 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8249 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8250 const bool UsePartialNSA =
8251 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8252
8253 SDValue VAddr;
8254 if (UsePartialNSA) {
8255 VAddr = getBuildDwordsVector(DAG, DL,
8256 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8257 } else if (!UseNSA) {
8258 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8259 }
8260
8261 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8262 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8263 SDValue Unorm;
8264 if (!BaseOpcode->Sampler) {
8265 Unorm = True;
8266 } else {
8267 uint64_t UnormConst =
8268 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8269
8270 Unorm = UnormConst ? True : False;
8271 }
8272
8273 SDValue TFE;
8274 SDValue LWE;
8275 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8276 bool IsTexFail = false;
8277 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8278 return Op;
8279
8280 if (IsTexFail) {
8281 if (!DMaskLanes) {
8282 // Expecting to get an error flag since TFC is on and dmask is 0.
8283 // Force dmask to be at least 1, otherwise the instruction will fail.
8284 DMask = 0x1;
8285 DMaskLanes = 1;
8286 NumVDataDwords = 1;
8287 }
8288 NumVDataDwords += 1;
8289 AdjustRetType = true;
8290 }
8291
8292 // Something earlier tagged the return type as needing adjustment. This
8293 // happens if the instruction is a load or has TexFailCtrl flags set.
8294 if (AdjustRetType) {
8295 // NumVDataDwords reflects the true number of dwords required in the return
8296 // type
8297 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8298 // This is a no-op load. This can be eliminated
8299 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8300 if (isa<MemSDNode>(Op))
8301 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8302 return Undef;
8303 }
8304
8305 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8306 MVT::i32, NumVDataDwords)
8307 : MVT::i32;
8308
8309 ResultTypes[0] = NewVT;
8310 if (ResultTypes.size() == 3) {
8311 // Original result was aggregate type used for TexFailCtrl results
8312 // The actual instruction returns as a vector type which has now been
8313 // created. Remove the aggregate result.
8314 ResultTypes.erase(&ResultTypes[1]);
8315 }
8316 }
8317
8318 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8319 if (BaseOpcode->Atomic)
8320 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8321 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8323 return Op;
8324
8326 if (BaseOpcode->Store || BaseOpcode->Atomic)
8327 Ops.push_back(VData); // vdata
8328 if (UsePartialNSA) {
8329 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8330 Ops.push_back(VAddr);
8331 } else if (UseNSA)
8332 append_range(Ops, VAddrs);
8333 else
8334 Ops.push_back(VAddr);
8335 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8336 EVT RsrcVT = Rsrc.getValueType();
8337 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8338 return Op;
8339 Ops.push_back(Rsrc);
8340 if (BaseOpcode->Sampler) {
8341 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8342 if (Samp.getValueType() != MVT::v4i32)
8343 return Op;
8344 Ops.push_back(Samp);
8345 }
8346 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8347 if (IsGFX10Plus)
8348 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8349 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8350 Ops.push_back(Unorm);
8351 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8352 Ops.push_back(IsA16 && // r128, a16 for gfx9
8353 ST->hasFeature(AMDGPU::FeatureR128A16)
8354 ? True
8355 : False);
8356 if (IsGFX10Plus)
8357 Ops.push_back(IsA16 ? True : False);
8358 if (!Subtarget->hasGFX90AInsts()) {
8359 Ops.push_back(TFE); // tfe
8360 } else if (TFE->getAsZExtVal()) {
8361 report_fatal_error("TFE is not supported on this GPU");
8362 }
8363 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8364 Ops.push_back(LWE); // lwe
8365 if (!IsGFX10Plus)
8366 Ops.push_back(DimInfo->DA ? True : False);
8367 if (BaseOpcode->HasD16)
8368 Ops.push_back(IsD16 ? True : False);
8369 if (isa<MemSDNode>(Op))
8370 Ops.push_back(Op.getOperand(0)); // chain
8371
8372 int NumVAddrDwords =
8373 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8374 int Opcode = -1;
8375
8376 if (IsGFX12Plus) {
8377 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8378 NumVDataDwords, NumVAddrDwords);
8379 } else if (IsGFX11Plus) {
8380 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8381 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8382 : AMDGPU::MIMGEncGfx11Default,
8383 NumVDataDwords, NumVAddrDwords);
8384 } else if (IsGFX10Plus) {
8385 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8386 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8387 : AMDGPU::MIMGEncGfx10Default,
8388 NumVDataDwords, NumVAddrDwords);
8389 } else {
8390 if (Subtarget->hasGFX90AInsts()) {
8391 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8392 NumVDataDwords, NumVAddrDwords);
8393 if (Opcode == -1)
8395 "requested image instruction is not supported on this GPU");
8396 }
8397 if (Opcode == -1 &&
8399 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8400 NumVDataDwords, NumVAddrDwords);
8401 if (Opcode == -1)
8402 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8403 NumVDataDwords, NumVAddrDwords);
8404 }
8405 if (Opcode == -1)
8406 return Op;
8407
8408 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8409 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8410 MachineMemOperand *MemRef = MemOp->getMemOperand();
8411 DAG.setNodeMemRefs(NewNode, {MemRef});
8412 }
8413
8414 if (BaseOpcode->AtomicX2) {
8416 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8417 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8418 }
8419 if (BaseOpcode->NoReturn)
8420 return SDValue(NewNode, 0);
8421 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8422 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8423 NumVDataDwords, IsAtomicPacked16Bit, DL);
8424}
8425
8426SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8427 SDValue Offset, SDValue CachePolicy,
8428 SelectionDAG &DAG) const {
8430
8431 const DataLayout &DataLayout = DAG.getDataLayout();
8432 Align Alignment =
8434
8439 VT.getStoreSize(), Alignment);
8440
8441 if (!Offset->isDivergent()) {
8442 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8443
8444 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8445 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8446 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8447 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8448 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8449 SDValue BufferLoad =
8451 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8452 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8453 }
8454
8455 // Widen vec3 load to vec4.
8456 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8457 !Subtarget->hasScalarDwordx3Loads()) {
8458 EVT WidenedVT =
8460 auto WidenedOp = DAG.getMemIntrinsicNode(
8461 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8462 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8463 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8464 DAG.getVectorIdxConstant(0, DL));
8465 return Subvector;
8466 }
8467
8469 DAG.getVTList(VT), Ops, VT, MMO);
8470 }
8471
8472 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8473 // assume that the buffer is unswizzled.
8474 SDValue Ops[] = {
8475 DAG.getEntryNode(), // Chain
8476 Rsrc, // rsrc
8477 DAG.getConstant(0, DL, MVT::i32), // vindex
8478 {}, // voffset
8479 {}, // soffset
8480 {}, // offset
8481 CachePolicy, // cachepolicy
8482 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8483 };
8484 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8485 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8486 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8487 }
8488
8490 unsigned NumLoads = 1;
8491 MVT LoadVT = VT.getSimpleVT();
8492 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8493 assert((LoadVT.getScalarType() == MVT::i32 ||
8494 LoadVT.getScalarType() == MVT::f32));
8495
8496 if (NumElts == 8 || NumElts == 16) {
8497 NumLoads = NumElts / 4;
8498 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8499 }
8500
8501 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8502
8503 // Use the alignment to ensure that the required offsets will fit into the
8504 // immediate offsets.
8505 setBufferOffsets(Offset, DAG, &Ops[3],
8506 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8507
8508 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8509 for (unsigned i = 0; i < NumLoads; ++i) {
8510 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8511 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8512 LoadVT, MMO, DAG));
8513 }
8514
8515 if (NumElts == 8 || NumElts == 16)
8516 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8517
8518 return Loads[0];
8519}
8520
8521SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8522 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8523 if (!Subtarget->hasArchitectedSGPRs())
8524 return {};
8525 SDLoc SL(Op);
8526 MVT VT = MVT::i32;
8527 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8528 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8529 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8530}
8531
8532SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8533 unsigned Dim,
8534 const ArgDescriptor &Arg) const {
8535 SDLoc SL(Op);
8537 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8538 if (MaxID == 0)
8539 return DAG.getConstant(0, SL, MVT::i32);
8540
8541 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8542 SDLoc(DAG.getEntryNode()), Arg);
8543
8544 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8545 // masking operations anyway.
8546 //
8547 // TODO: We could assert the top bit is 0 for the source copy.
8548 if (Arg.isMasked())
8549 return Val;
8550
8551 // Preserve the known bits after expansion to a copy.
8553 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8554 DAG.getValueType(SmallVT));
8555}
8556
8557SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8558 SelectionDAG &DAG) const {
8560 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8561
8562 EVT VT = Op.getValueType();
8563 SDLoc DL(Op);
8564 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8565
8566 // TODO: Should this propagate fast-math-flags?
8567
8568 switch (IntrinsicID) {
8569 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8570 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8571 return emitNonHSAIntrinsicError(DAG, DL, VT);
8572 return getPreloadedValue(DAG, *MFI, VT,
8574 }
8575 case Intrinsic::amdgcn_dispatch_ptr:
8576 case Intrinsic::amdgcn_queue_ptr: {
8577 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8578 DiagnosticInfoUnsupported BadIntrin(
8579 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8580 DL.getDebugLoc());
8581 DAG.getContext()->diagnose(BadIntrin);
8582 return DAG.getUNDEF(VT);
8583 }
8584
8585 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8588 return getPreloadedValue(DAG, *MFI, VT, RegID);
8589 }
8590 case Intrinsic::amdgcn_implicitarg_ptr: {
8591 if (MFI->isEntryFunction())
8592 return getImplicitArgPtr(DAG, DL);
8593 return getPreloadedValue(DAG, *MFI, VT,
8595 }
8596 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8597 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8598 // This only makes sense to call in a kernel, so just lower to null.
8599 return DAG.getConstant(0, DL, VT);
8600 }
8601
8602 return getPreloadedValue(DAG, *MFI, VT,
8603 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8604 }
8605 case Intrinsic::amdgcn_dispatch_id: {
8606 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8607 }
8608 case Intrinsic::amdgcn_rcp:
8609 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8610 case Intrinsic::amdgcn_rsq:
8611 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8612 case Intrinsic::amdgcn_rsq_legacy:
8613 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8614 return emitRemovedIntrinsicError(DAG, DL, VT);
8615 return SDValue();
8616 case Intrinsic::amdgcn_rcp_legacy:
8617 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8618 return emitRemovedIntrinsicError(DAG, DL, VT);
8619 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8620 case Intrinsic::amdgcn_rsq_clamp: {
8621 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8622 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8623
8624 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8625 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8626 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8627
8628 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8629 SDValue Tmp =
8630 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8631 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8632 DAG.getConstantFP(Min, DL, VT));
8633 }
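// Illustrative sketch (not from the LLVM sources): a scalar f32 model of the
// clamp expansion above, which computes max(min(rsq(x), +largest), -largest)
// when the native clamped instruction is unavailable.
#include <cfloat>
#include <cmath>

static float rsqClampF32(float X) {
  float Rsq = 1.0f / std::sqrt(X);     // AMDGPUISD::RSQ
  float Tmp = std::fmin(Rsq, FLT_MAX); // FMINNUM against +largest
  return std::fmax(Tmp, -FLT_MAX);     // FMAXNUM against -largest
}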
8634 case Intrinsic::r600_read_ngroups_x:
8635 if (Subtarget->isAmdHsaOS())
8636 return emitNonHSAIntrinsicError(DAG, DL, VT);
8637
8638 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8639 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8640 false);
8641 case Intrinsic::r600_read_ngroups_y:
8642 if (Subtarget->isAmdHsaOS())
8643 return emitNonHSAIntrinsicError(DAG, DL, VT);
8644
8645 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8646 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8647 false);
8648 case Intrinsic::r600_read_ngroups_z:
8649 if (Subtarget->isAmdHsaOS())
8650 return emitNonHSAIntrinsicError(DAG, DL, VT);
8651
8652 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8653 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8654 false);
8655 case Intrinsic::r600_read_global_size_x:
8656 if (Subtarget->isAmdHsaOS())
8657 return emitNonHSAIntrinsicError(DAG, DL, VT);
8658
8659 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8660 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8661 Align(4), false);
8662 case Intrinsic::r600_read_global_size_y:
8663 if (Subtarget->isAmdHsaOS())
8664 return emitNonHSAIntrinsicError(DAG, DL, VT);
8665
8666 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8667 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8668 Align(4), false);
8669 case Intrinsic::r600_read_global_size_z:
8670 if (Subtarget->isAmdHsaOS())
8671 return emitNonHSAIntrinsicError(DAG, DL, VT);
8672
8673 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8674 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8675 Align(4), false);
8676 case Intrinsic::r600_read_local_size_x:
8677 if (Subtarget->isAmdHsaOS())
8678 return emitNonHSAIntrinsicError(DAG, DL, VT);
8679
8680 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8681 SI::KernelInputOffsets::LOCAL_SIZE_X);
8682 case Intrinsic::r600_read_local_size_y:
8683 if (Subtarget->isAmdHsaOS())
8684 return emitNonHSAIntrinsicError(DAG, DL, VT);
8685
8686 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8687 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8688 case Intrinsic::r600_read_local_size_z:
8689 if (Subtarget->isAmdHsaOS())
8690 return emitNonHSAIntrinsicError(DAG, DL, VT);
8691
8692 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8693 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8694 case Intrinsic::amdgcn_workgroup_id_x:
8695 return getPreloadedValue(DAG, *MFI, VT,
8696 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8697 case Intrinsic::amdgcn_workgroup_id_y:
8698 return getPreloadedValue(DAG, *MFI, VT,
8699 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8700 case Intrinsic::amdgcn_workgroup_id_z:
8701 return getPreloadedValue(DAG, *MFI, VT,
8702 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8703 case Intrinsic::amdgcn_wave_id:
8704 return lowerWaveID(DAG, Op);
8705 case Intrinsic::amdgcn_lds_kernel_id: {
8706 if (MFI->isEntryFunction())
8707 return getLDSKernelId(DAG, DL);
8708 return getPreloadedValue(DAG, *MFI, VT,
8709 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8710 }
8711 case Intrinsic::amdgcn_workitem_id_x:
8712 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8713 case Intrinsic::amdgcn_workitem_id_y:
8714 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8715 case Intrinsic::amdgcn_workitem_id_z:
8716 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8717 case Intrinsic::amdgcn_wavefrontsize:
8718 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8719 SDLoc(Op), MVT::i32);
8720 case Intrinsic::amdgcn_s_buffer_load: {
8721 unsigned CPol = Op.getConstantOperandVal(3);
8722 // s_buffer_load, because of how it's optimized, can't be volatile
8723 // so reject ones with the volatile bit set.
8724 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8725 ? AMDGPU::CPol::ALL
8726 : AMDGPU::CPol::ALL_pregfx12))
8727 return Op;
8728 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8729 Op.getOperand(3), DAG);
8730 }
8731 case Intrinsic::amdgcn_fdiv_fast:
8732 return lowerFDIV_FAST(Op, DAG);
8733 case Intrinsic::amdgcn_sin:
8734 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8735
8736 case Intrinsic::amdgcn_cos:
8737 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8738
8739 case Intrinsic::amdgcn_mul_u24:
8740 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8741 Op.getOperand(2));
8742 case Intrinsic::amdgcn_mul_i24:
8743 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8744 Op.getOperand(2));
8745
8746 case Intrinsic::amdgcn_log_clamp: {
8747 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8748 return SDValue();
8749
8750 return emitRemovedIntrinsicError(DAG, DL, VT);
8751 }
8752 case Intrinsic::amdgcn_fract:
8753 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8754
8755 case Intrinsic::amdgcn_class:
8756 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8757 Op.getOperand(2));
8758 case Intrinsic::amdgcn_div_fmas:
8759 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8760 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8761
8762 case Intrinsic::amdgcn_div_fixup:
8763 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8764 Op.getOperand(2), Op.getOperand(3));
8765
8766 case Intrinsic::amdgcn_div_scale: {
8767 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8768
8769 // Translate to the operands expected by the machine instruction. The
8770 // first parameter must be the same as the first instruction.
8771 SDValue Numerator = Op.getOperand(1);
8772 SDValue Denominator = Op.getOperand(2);
8773
8774 // Note this order is opposite of the machine instruction's operations,
8775 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8776 // intrinsic has the numerator as the first operand to match a normal
8777 // division operation.
8778
8779 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8780
8781 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8782 Denominator, Numerator);
8783 }
8784 case Intrinsic::amdgcn_icmp: {
8785 // There is a Pat that handles this variant, so return it as-is.
8786 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8787 Op.getConstantOperandVal(2) == 0 &&
8788 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8789 return Op;
8790 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8791 }
8792 case Intrinsic::amdgcn_fcmp: {
8793 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8794 }
8795 case Intrinsic::amdgcn_ballot:
8796 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8797 case Intrinsic::amdgcn_fmed3:
8798 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8799 Op.getOperand(2), Op.getOperand(3));
8800 case Intrinsic::amdgcn_fdot2:
8801 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8802 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8803 case Intrinsic::amdgcn_fmul_legacy:
8804 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8805 Op.getOperand(2));
8806 case Intrinsic::amdgcn_sffbh:
8807 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8808 case Intrinsic::amdgcn_sbfe:
8809 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8810 Op.getOperand(2), Op.getOperand(3));
8811 case Intrinsic::amdgcn_ubfe:
8812 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8813 Op.getOperand(2), Op.getOperand(3));
8814 case Intrinsic::amdgcn_cvt_pkrtz:
8815 case Intrinsic::amdgcn_cvt_pknorm_i16:
8816 case Intrinsic::amdgcn_cvt_pknorm_u16:
8817 case Intrinsic::amdgcn_cvt_pk_i16:
8818 case Intrinsic::amdgcn_cvt_pk_u16: {
8819 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8820 EVT VT = Op.getValueType();
8821 unsigned Opcode;
8822
8823 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8824 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8825 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8826 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8827 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8828 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8829 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8830 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8831 else
8832 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8833
8834 if (isTypeLegal(VT))
8835 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8836
8837 SDValue Node =
8838 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8839 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8840 }
8841 case Intrinsic::amdgcn_fmad_ftz:
8842 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8843 Op.getOperand(2), Op.getOperand(3));
8844
8845 case Intrinsic::amdgcn_if_break:
8846 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8847 Op->getOperand(1), Op->getOperand(2)),
8848 0);
8849
8850 case Intrinsic::amdgcn_groupstaticsize: {
8851 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8852 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8853 return Op;
8854
8855 const Module *M = MF.getFunction().getParent();
8856 const GlobalValue *GV =
8857 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8858 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8859 SIInstrInfo::MO_ABS32_LO);
8860 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8861 }
8862 case Intrinsic::amdgcn_is_shared:
8863 case Intrinsic::amdgcn_is_private: {
8864 SDLoc SL(Op);
8865 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8866 ? AMDGPUAS::LOCAL_ADDRESS
8867 : AMDGPUAS::PRIVATE_ADDRESS;
8868 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8869 SDValue SrcVec =
8870 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8871
8872 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8873 DAG.getConstant(1, SL, MVT::i32));
8874 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8875 }
8876 case Intrinsic::amdgcn_perm:
8877 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8878 Op.getOperand(2), Op.getOperand(3));
8879 case Intrinsic::amdgcn_reloc_constant: {
8880 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8881 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8882 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8883 auto *RelocSymbol = cast<GlobalVariable>(
8884 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8885 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8886 SIInstrInfo::MO_ABS32_LO);
8887 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8888 }
8889 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8890 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8891 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8892 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8893 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8894 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8895 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8896 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8897 if (Op.getOperand(4).getValueType() == MVT::i32)
8898 return SDValue();
8899
8900 SDLoc SL(Op);
8901 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8902 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8903 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8904 Op.getOperand(3), IndexKeyi32);
8905 }
8906 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8907 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8908 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8909 if (Op.getOperand(6).getValueType() == MVT::i32)
8910 return SDValue();
8911
8912 SDLoc SL(Op);
8913 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8914 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8915 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8916 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8917 IndexKeyi32, Op.getOperand(7)});
8918 }
8919 case Intrinsic::amdgcn_addrspacecast_nonnull:
8920 return lowerADDRSPACECAST(Op, DAG);
8921 case Intrinsic::amdgcn_readlane:
8922 case Intrinsic::amdgcn_readfirstlane:
8923 case Intrinsic::amdgcn_writelane:
8924 case Intrinsic::amdgcn_permlane16:
8925 case Intrinsic::amdgcn_permlanex16:
8926 case Intrinsic::amdgcn_permlane64:
8927 case Intrinsic::amdgcn_set_inactive:
8928 case Intrinsic::amdgcn_set_inactive_chain_arg:
8929 case Intrinsic::amdgcn_mov_dpp8:
8930 case Intrinsic::amdgcn_update_dpp:
8931 return lowerLaneOp(*this, Op.getNode(), DAG);
8932 default:
8933 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8934 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8935 return lowerImage(Op, ImageDimIntr, DAG, false);
8936
8937 return Op;
8938 }
8939}
8940
8941// On targets not supporting constant in soffset field, turn zero to
8942// SGPR_NULL to avoid generating an extra s_mov with zero.
8943 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8944 const GCNSubtarget *Subtarget) {
8945 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8946 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8947 return SOffset;
8948}
8949
8950SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8951 SelectionDAG &DAG,
8952 unsigned NewOpcode) const {
8953 SDLoc DL(Op);
8954
8955 SDValue VData = Op.getOperand(2);
8956 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8957 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8958 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8959 SDValue Ops[] = {
8960 Op.getOperand(0), // Chain
8961 VData, // vdata
8962 Rsrc, // rsrc
8963 DAG.getConstant(0, DL, MVT::i32), // vindex
8964 VOffset, // voffset
8965 SOffset, // soffset
8966 Offset, // offset
8967 Op.getOperand(6), // cachepolicy
8968 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8969 };
8970
8971 auto *M = cast<MemSDNode>(Op);
8972
8973 EVT MemVT = VData.getValueType();
8974 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8975 M->getMemOperand());
8976}
8977
8978SDValue
8979SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8980 unsigned NewOpcode) const {
8981 SDLoc DL(Op);
8982
8983 SDValue VData = Op.getOperand(2);
8984 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8985 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8986 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8987 SDValue Ops[] = {
8988 Op.getOperand(0), // Chain
8989 VData, // vdata
8990 Rsrc, // rsrc
8991 Op.getOperand(4), // vindex
8992 VOffset, // voffset
8993 SOffset, // soffset
8994 Offset, // offset
8995 Op.getOperand(7), // cachepolicy
8996 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8997 };
8998
8999 auto *M = cast<MemSDNode>(Op);
9000
9001 EVT MemVT = VData.getValueType();
9002 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9003 M->getMemOperand());
9004}
9005
9006SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9007 SelectionDAG &DAG) const {
9008 unsigned IntrID = Op.getConstantOperandVal(1);
9009 SDLoc DL(Op);
9010
9011 switch (IntrID) {
9012 case Intrinsic::amdgcn_ds_ordered_add:
9013 case Intrinsic::amdgcn_ds_ordered_swap: {
9014 MemSDNode *M = cast<MemSDNode>(Op);
9015 SDValue Chain = M->getOperand(0);
9016 SDValue M0 = M->getOperand(2);
9017 SDValue Value = M->getOperand(3);
9018 unsigned IndexOperand = M->getConstantOperandVal(7);
9019 unsigned WaveRelease = M->getConstantOperandVal(8);
9020 unsigned WaveDone = M->getConstantOperandVal(9);
9021
9022 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9023 IndexOperand &= ~0x3f;
9024 unsigned CountDw = 0;
9025
9026 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9027 CountDw = (IndexOperand >> 24) & 0xf;
9028 IndexOperand &= ~(0xf << 24);
9029
9030 if (CountDw < 1 || CountDw > 4) {
9032 "ds_ordered_count: dword count must be between 1 and 4");
9033 }
9034 }
9035
9036 if (IndexOperand)
9037 report_fatal_error("ds_ordered_count: bad index operand");
9038
9039 if (WaveDone && !WaveRelease)
9040 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9041
9042 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9043 unsigned ShaderType =
9044 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9045 unsigned Offset0 = OrderedCountIndex << 2;
9046 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9047
9048 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9049 Offset1 |= (CountDw - 1) << 6;
9050
9051 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9052 Offset1 |= ShaderType << 2;
9053
9054 unsigned Offset = Offset0 | (Offset1 << 8);
9055
9056 SDValue Ops[] = {
9057 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9058 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9059 };
9060 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9061 M->getVTList(), Ops, M->getMemoryVT(),
9062 M->getMemOperand());
9063 }
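// Illustrative sketch (not from the LLVM sources): the immediate-offset
// packing for DS_ORDERED_COUNT as computed above. The helper name and
// parameter spelling are hypothetical; the bit positions come straight from
// the lowering (Offset0 in the low byte, Offset1 shifted into the high byte).
#include <cstdint>

static uint16_t packDSOrderedOffset(unsigned OrderedCountIndex, bool WaveRelease,
                                    bool WaveDone, unsigned Instruction,
                                    unsigned ShaderType, unsigned CountDw,
                                    bool IsGFX10Plus, bool IsGFX11Plus) {
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  if (IsGFX10Plus)
    Offset1 |= (CountDw - 1) << 6;
  if (!IsGFX11Plus)
    Offset1 |= ShaderType << 2;
  return static_cast<uint16_t>(Offset0 | (Offset1 << 8));
}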
9064 case Intrinsic::amdgcn_raw_buffer_load:
9065 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9066 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9067 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9068 case Intrinsic::amdgcn_raw_buffer_load_format:
9069 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9070 const bool IsFormat =
9071 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9072 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9073
9074 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9075 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9076 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9077 SDValue Ops[] = {
9078 Op.getOperand(0), // Chain
9079 Rsrc, // rsrc
9080 DAG.getConstant(0, DL, MVT::i32), // vindex
9081 VOffset, // voffset
9082 SOffset, // soffset
9083 Offset, // offset
9084 Op.getOperand(5), // cachepolicy, swizzled buffer
9085 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9086 };
9087
9088 auto *M = cast<MemSDNode>(Op);
9089 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9090 }
9091 case Intrinsic::amdgcn_struct_buffer_load:
9092 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9093 case Intrinsic::amdgcn_struct_buffer_load_format:
9094 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9095 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9096 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9097 const bool IsFormat =
9098 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9099 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9100
9101 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9102 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9103 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9104 SDValue Ops[] = {
9105 Op.getOperand(0), // Chain
9106 Rsrc, // rsrc
9107 Op.getOperand(3), // vindex
9108 VOffset, // voffset
9109 SOffset, // soffset
9110 Offset, // offset
9111 Op.getOperand(6), // cachepolicy, swizzled buffer
9112 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9113 };
9114
9115 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9116 }
9117 case Intrinsic::amdgcn_raw_tbuffer_load:
9118 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9119 MemSDNode *M = cast<MemSDNode>(Op);
9120 EVT LoadVT = Op.getValueType();
9121 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9122 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9123 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9124
9125 SDValue Ops[] = {
9126 Op.getOperand(0), // Chain
9127 Rsrc, // rsrc
9128 DAG.getConstant(0, DL, MVT::i32), // vindex
9129 VOffset, // voffset
9130 SOffset, // soffset
9131 Offset, // offset
9132 Op.getOperand(5), // format
9133 Op.getOperand(6), // cachepolicy, swizzled buffer
9134 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9135 };
9136
9137 if (LoadVT.getScalarType() == MVT::f16)
9138 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9139 Ops);
9140 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9141 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9142 DAG);
9143 }
9144 case Intrinsic::amdgcn_struct_tbuffer_load:
9145 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9146 MemSDNode *M = cast<MemSDNode>(Op);
9147 EVT LoadVT = Op.getValueType();
9148 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9149 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9150 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9151
9152 SDValue Ops[] = {
9153 Op.getOperand(0), // Chain
9154 Rsrc, // rsrc
9155 Op.getOperand(3), // vindex
9156 VOffset, // voffset
9157 SOffset, // soffset
9158 Offset, // offset
9159 Op.getOperand(6), // format
9160 Op.getOperand(7), // cachepolicy, swizzled buffer
9161 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9162 };
9163
9164 if (LoadVT.getScalarType() == MVT::f16)
9165 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9166 Ops);
9167 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9168 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9169 DAG);
9170 }
9171 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9172 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9173 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9174 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9175 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9176 return lowerStructBufferAtomicIntrin(Op, DAG,
9177 AMDGPUISD::BUFFER_ATOMIC_FADD);
9178 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9180 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9181 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9182 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9183 return lowerStructBufferAtomicIntrin(Op, DAG,
9184 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9185 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9186 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9187 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9188 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9189 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9190 return lowerStructBufferAtomicIntrin(Op, DAG,
9192 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9193 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9194 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9195 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9196 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9197 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9198 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9200 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9201 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9202 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9203 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9204 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9205 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9206 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9207 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9208 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9209 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9210 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9212 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9213 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9214 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9215 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9216 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9217 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9218 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9219 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9221 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9222 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9223 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9224 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9225 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9227 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9228 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9229 return lowerRawBufferAtomicIntrin(Op, DAG,
9230 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9231 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9232 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9233 return lowerStructBufferAtomicIntrin(Op, DAG,
9234 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9235 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9236 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9237 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9238 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9239 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9240 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9241 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9243 return lowerStructBufferAtomicIntrin(Op, DAG,
9244 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9245 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9246 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9247 return lowerStructBufferAtomicIntrin(Op, DAG,
9248 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9249 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9251 return lowerStructBufferAtomicIntrin(Op, DAG,
9252 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9253 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9254 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9255 return lowerStructBufferAtomicIntrin(Op, DAG,
9256 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9257 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9259 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9260 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9262 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9263 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9264 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9265 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9266 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9267 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9268 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9269 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9271 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9272 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9273 return lowerStructBufferAtomicIntrin(Op, DAG,
9274 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9275
9276 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9277 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9278 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9279 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9280 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9281 SDValue Ops[] = {
9282 Op.getOperand(0), // Chain
9283 Op.getOperand(2), // src
9284 Op.getOperand(3), // cmp
9285 Rsrc, // rsrc
9286 DAG.getConstant(0, DL, MVT::i32), // vindex
9287 VOffset, // voffset
9288 SOffset, // soffset
9289 Offset, // offset
9290 Op.getOperand(7), // cachepolicy
9291 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9292 };
9293 EVT VT = Op.getValueType();
9294 auto *M = cast<MemSDNode>(Op);
9295
9296 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9297 Op->getVTList(), Ops, VT,
9298 M->getMemOperand());
9299 }
9300 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9301 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9302 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9303 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9304 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9305 SDValue Ops[] = {
9306 Op.getOperand(0), // Chain
9307 Op.getOperand(2), // src
9308 Op.getOperand(3), // cmp
9309 Rsrc, // rsrc
9310 Op.getOperand(5), // vindex
9311 VOffset, // voffset
9312 SOffset, // soffset
9313 Offset, // offset
9314 Op.getOperand(8), // cachepolicy
9315 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9316 };
9317 EVT VT = Op.getValueType();
9318 auto *M = cast<MemSDNode>(Op);
9319
9320 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9321 Op->getVTList(), Ops, VT,
9322 M->getMemOperand());
9323 }
9324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9325 MemSDNode *M = cast<MemSDNode>(Op);
9326 SDValue NodePtr = M->getOperand(2);
9327 SDValue RayExtent = M->getOperand(3);
9328 SDValue RayOrigin = M->getOperand(4);
9329 SDValue RayDir = M->getOperand(5);
9330 SDValue RayInvDir = M->getOperand(6);
9331 SDValue TDescr = M->getOperand(7);
9332
9333 assert(NodePtr.getValueType() == MVT::i32 ||
9334 NodePtr.getValueType() == MVT::i64);
9335 assert(RayDir.getValueType() == MVT::v3f16 ||
9336 RayDir.getValueType() == MVT::v3f32);
9337
9338 if (!Subtarget->hasGFX10_AEncoding()) {
9339 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9340 return SDValue();
9341 }
9342
9343 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9344 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9345 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9346 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9347 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9348 const unsigned NumVDataDwords = 4;
9349 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9350 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9351 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9352 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9353 IsGFX12Plus;
9354 const unsigned BaseOpcodes[2][2] = {
9355 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9356 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9357 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9358 int Opcode;
9359 if (UseNSA) {
9360 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9361 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9362 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9363 : AMDGPU::MIMGEncGfx10NSA,
9364 NumVDataDwords, NumVAddrDwords);
9365 } else {
9366 assert(!IsGFX12Plus);
9367 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9368 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9369 : AMDGPU::MIMGEncGfx10Default,
9370 NumVDataDwords, NumVAddrDwords);
9371 }
9372 assert(Opcode != -1);
9373
9374 SmallVector<SDValue, 16> Ops;
9375
9376 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9377 SmallVector<SDValue, 3> Lanes;
9378 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9379 if (Lanes[0].getValueSizeInBits() == 32) {
9380 for (unsigned I = 0; I < 3; ++I)
9381 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9382 } else {
9383 if (IsAligned) {
9384 Ops.push_back(DAG.getBitcast(
9385 MVT::i32,
9386 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9387 Ops.push_back(Lanes[2]);
9388 } else {
9389 SDValue Elt0 = Ops.pop_back_val();
9390 Ops.push_back(DAG.getBitcast(
9391 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9392 Ops.push_back(DAG.getBitcast(
9393 MVT::i32,
9394 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9395 }
9396 }
9397 };
9398
9399 if (UseNSA && IsGFX11Plus) {
9400 Ops.push_back(NodePtr);
9401 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9402 Ops.push_back(RayOrigin);
9403 if (IsA16) {
9404 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9405 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9406 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9407 for (unsigned I = 0; I < 3; ++I) {
9408 MergedLanes.push_back(DAG.getBitcast(
9409 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9410 {DirLanes[I], InvDirLanes[I]})));
9411 }
9412 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9413 } else {
9414 Ops.push_back(RayDir);
9415 Ops.push_back(RayInvDir);
9416 }
9417 } else {
9418 if (Is64)
9419 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9420 2);
9421 else
9422 Ops.push_back(NodePtr);
9423
9424 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9425 packLanes(RayOrigin, true);
9426 packLanes(RayDir, true);
9427 packLanes(RayInvDir, false);
9428 }
9429
9430 if (!UseNSA) {
9431 // Build a single vector containing all the operands so far prepared.
9432 if (NumVAddrDwords > 12) {
9433 SDValue Undef = DAG.getUNDEF(MVT::i32);
9434 Ops.append(16 - Ops.size(), Undef);
9435 }
9436 assert(Ops.size() >= 8 && Ops.size() <= 12);
9437 SDValue MergedOps =
9438 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9439 Ops.clear();
9440 Ops.push_back(MergedOps);
9441 }
9442
9443 Ops.push_back(TDescr);
9444 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9445 Ops.push_back(M->getChain());
9446
9447 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9448 MachineMemOperand *MemRef = M->getMemOperand();
9449 DAG.setNodeMemRefs(NewNode, {MemRef});
9450 return SDValue(NewNode, 0);
9451 }
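// Illustrative sketch (not from the LLVM sources): a scalar model of the
// packLanes lambda above for the 16-bit (A16) path, treating each f16 lane as
// a raw uint16_t bit pattern. The aligned form starts a fresh dword pair; the
// unaligned form first completes the previously pushed half-filled dword.
#include <cstdint>
#include <vector>

static void packLanes16(std::vector<uint32_t> &Ops, const uint16_t Lanes[3],
                        bool IsAligned) {
  auto Pack = [](uint16_t Lo, uint16_t Hi) {
    return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
  };
  if (IsAligned) {
    Ops.push_back(Pack(Lanes[0], Lanes[1]));
    Ops.push_back(Lanes[2]); // low half only; completed by the next call
  } else {
    uint16_t Elt0 = static_cast<uint16_t>(Ops.back());
    Ops.pop_back();
    Ops.push_back(Pack(Elt0, Lanes[0]));
    Ops.push_back(Pack(Lanes[1], Lanes[2]));
  }
}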
9452 case Intrinsic::amdgcn_global_atomic_fmin_num:
9453 case Intrinsic::amdgcn_global_atomic_fmax_num:
9454 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9455 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9456 MemSDNode *M = cast<MemSDNode>(Op);
9457 SDValue Ops[] = {
9458 M->getOperand(0), // Chain
9459 M->getOperand(2), // Ptr
9460 M->getOperand(3) // Value
9461 };
9462 unsigned Opcode = 0;
9463 switch (IntrID) {
9464 case Intrinsic::amdgcn_global_atomic_fmin_num:
9465 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9466 Opcode = ISD::ATOMIC_LOAD_FMIN;
9467 break;
9468 }
9469 case Intrinsic::amdgcn_global_atomic_fmax_num:
9470 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9471 Opcode = ISD::ATOMIC_LOAD_FMAX;
9472 break;
9473 }
9474 default:
9475 llvm_unreachable("unhandled atomic opcode");
9476 }
9477 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9478 Ops, M->getMemOperand());
9479 }
9480 case Intrinsic::amdgcn_s_get_barrier_state:
9481 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9482 SDValue Chain = Op->getOperand(0);
9483 SmallVector<SDValue, 2> Ops;
9484 unsigned Opc;
9485
9486 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9487 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9488 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9489 BarID = (BarID >> 4) & 0x3F;
9490 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9491 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9492 Ops.push_back(K);
9493 Ops.push_back(Chain);
9494 } else {
9495 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9496 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9497 SDValue M0Val;
9498 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9499 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9500 M0Val = SDValue(
9501 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9502 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9503 0);
9504 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9505 } else
9506 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9507 }
9508
9509 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9510 return SDValue(NewMI, 0);
9511 }
9512 default:
9513
9514 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9515 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9516 return lowerImage(Op, ImageDimIntr, DAG, true);
9517
9518 return SDValue();
9519 }
9520}
9521
9522// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9523// dwordx4 if on SI and handle TFE loads.
9524SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9525 SDVTList VTList,
9526 ArrayRef<SDValue> Ops, EVT MemVT,
9527 MachineMemOperand *MMO,
9528 SelectionDAG &DAG) const {
9529 LLVMContext &C = *DAG.getContext();
9530 MachineFunction &MF = DAG.getMachineFunction();
9531 EVT VT = VTList.VTs[0];
9532
9533 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9534 bool IsTFE = VTList.NumVTs == 3;
9535 if (IsTFE) {
9536 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9537 unsigned NumOpDWords = NumValueDWords + 1;
9538 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9539 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9540 MachineMemOperand *OpDWordsMMO =
9541 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9542 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9543 OpDWordsVT, OpDWordsMMO, DAG);
9544 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9545 DAG.getVectorIdxConstant(NumValueDWords, DL));
9546 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9547 SDValue ValueDWords =
9548 NumValueDWords == 1
9549 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9550 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9551 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9552 ZeroIdx);
9553 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9554 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9555 }
9556
9557 if (!Subtarget->hasDwordx3LoadStores() &&
9558 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9559 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9560 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9561 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9562 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9563 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9564 WidenedMemVT, WidenedMMO);
9565 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9566 DAG.getVectorIdxConstant(0, DL));
9567 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9568 }
9569
9570 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9571}
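// Illustrative sketch (not from the LLVM sources): the dword accounting used
// for TFE loads above. The value needs ceil(bits / 32) dwords and the TFE
// status word adds one more to the widened operation.
#include <cstdint>

static unsigned tfeOpDwords(unsigned ValueSizeInBits) {
  unsigned NumValueDWords = (ValueSizeInBits + 31) / 32; // divideCeil(bits, 32)
  return NumValueDWords + 1;                             // plus status dword
}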
9572
9573SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9574 bool ImageStore) const {
9575 EVT StoreVT = VData.getValueType();
9576
9577 // No change for f16 and legal vector D16 types.
9578 if (!StoreVT.isVector())
9579 return VData;
9580
9581 SDLoc DL(VData);
9582 unsigned NumElements = StoreVT.getVectorNumElements();
9583
9584 if (Subtarget->hasUnpackedD16VMem()) {
9585 // We need to unpack the packed data to store.
9586 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9587 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9588
9589 EVT EquivStoreVT =
9590 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9591 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9592 return DAG.UnrollVectorOp(ZExt.getNode());
9593 }
9594
9595 // The sq block of gfx8.1 does not estimate register use correctly for d16
9596 // image store instructions. The data operand is computed as if it were not a
9597 // d16 image instruction.
9598 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9599 // Bitcast to i16
9600 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9601 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9602
9603 // Decompose into scalars
9604 SmallVector<SDValue, 4> Elts;
9605 DAG.ExtractVectorElements(IntVData, Elts);
9606
9607 // Group pairs of i16 into v2i16 and bitcast to i32
9608 SmallVector<SDValue, 4> PackedElts;
9609 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9610 SDValue Pair =
9611 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9612 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9613 PackedElts.push_back(IntPair);
9614 }
9615 if ((NumElements % 2) == 1) {
9616 // Handle v3i16
9617 unsigned I = Elts.size() / 2;
9618 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9619 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9620 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9621 PackedElts.push_back(IntPair);
9622 }
9623
9624 // Pad using UNDEF
9625 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9626
9627 // Build final vector
9628 EVT VecVT =
9629 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9630 return DAG.getBuildVector(VecVT, DL, PackedElts);
9631 }
9632
9633 if (NumElements == 3) {
9634 EVT IntStoreVT =
9635 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9636 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9637
9638 EVT WidenedStoreVT = EVT::getVectorVT(
9639 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9640 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9641 WidenedStoreVT.getStoreSizeInBits());
9642 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9643 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9644 }
9645
9646 assert(isTypeLegal(StoreVT));
9647 return VData;
9648}
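// Illustrative sketch (not from the LLVM sources): a plain C++ model of the
// gfx8.1 image-store workaround above, which regroups 16-bit elements into
// 32-bit words and leaves the high half of an odd tail element undefined.
// (The real lowering additionally pads the packed vector with undef dwords.)
#include <cstdint>
#include <vector>

static std::vector<uint32_t> packD16Pairs(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Packed;
  for (size_t I = 0; I + 1 < Elts.size(); I += 2)
    Packed.push_back(uint32_t(Elts[I]) | (uint32_t(Elts[I + 1]) << 16));
  if (Elts.size() % 2)
    Packed.push_back(uint32_t(Elts.back())); // high 16 bits are don't-care
  return Packed;
}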
9649
9650SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9651 SelectionDAG &DAG) const {
9652 SDLoc DL(Op);
9653 SDValue Chain = Op.getOperand(0);
9654 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9655 MachineFunction &MF = DAG.getMachineFunction();
9656
9657 switch (IntrinsicID) {
9658 case Intrinsic::amdgcn_exp_compr: {
9659 if (!Subtarget->hasCompressedExport()) {
9660 DiagnosticInfoUnsupported BadIntrin(
9662 "intrinsic not supported on subtarget", DL.getDebugLoc());
9663 DAG.getContext()->diagnose(BadIntrin);
9664 }
9665 SDValue Src0 = Op.getOperand(4);
9666 SDValue Src1 = Op.getOperand(5);
9667 // Hack around illegal type on SI by directly selecting it.
9668 if (isTypeLegal(Src0.getValueType()))
9669 return SDValue();
9670
9671 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9672 SDValue Undef = DAG.getUNDEF(MVT::f32);
9673 const SDValue Ops[] = {
9674 Op.getOperand(2), // tgt
9675 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9676 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9677 Undef, // src2
9678 Undef, // src3
9679 Op.getOperand(7), // vm
9680 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9681 Op.getOperand(3), // en
9682 Op.getOperand(0) // Chain
9683 };
9684
9685 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9686 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9687 }
9688 case Intrinsic::amdgcn_s_barrier:
9689 case Intrinsic::amdgcn_s_barrier_signal:
9690 case Intrinsic::amdgcn_s_barrier_wait: {
9691 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9692 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9693 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9694 if (WGSize <= ST.getWavefrontSize()) {
9695 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9696 // s_barrier/s_barrier_wait to wave_barrier.
9697 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9698 return Op.getOperand(0);
9699 else
9700 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9701 MVT::Other, Op.getOperand(0)),
9702 0);
9703 }
9704 }
9705
9706 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9707 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9708 SDValue K =
9709 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9710 SDValue BarSignal =
9711 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9712 MVT::Other, K, Op.getOperand(0)),
9713 0);
9714 SDValue BarWait =
9715 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9716 BarSignal.getValue(0)),
9717 0);
9718 return BarWait;
9719 }
9720
9721 return SDValue();
9722 };
9723
9724 case Intrinsic::amdgcn_struct_tbuffer_store:
9725 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9726 SDValue VData = Op.getOperand(2);
9727 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9728 if (IsD16)
9729 VData = handleD16VData(VData, DAG);
9730 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9731 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9732 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9733 SDValue Ops[] = {
9734 Chain,
9735 VData, // vdata
9736 Rsrc, // rsrc
9737 Op.getOperand(4), // vindex
9738 VOffset, // voffset
9739 SOffset, // soffset
9740 Offset, // offset
9741 Op.getOperand(7), // format
9742 Op.getOperand(8), // cachepolicy, swizzled buffer
9743 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9744 };
9745 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9746 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9747 MemSDNode *M = cast<MemSDNode>(Op);
9748 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9749 M->getMemoryVT(), M->getMemOperand());
9750 }
9751
9752 case Intrinsic::amdgcn_raw_tbuffer_store:
9753 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9754 SDValue VData = Op.getOperand(2);
9755 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9756 if (IsD16)
9757 VData = handleD16VData(VData, DAG);
9758 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9759 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9760 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9761 SDValue Ops[] = {
9762 Chain,
9763 VData, // vdata
9764 Rsrc, // rsrc
9765 DAG.getConstant(0, DL, MVT::i32), // vindex
9766 VOffset, // voffset
9767 SOffset, // soffset
9768 Offset, // offset
9769 Op.getOperand(6), // format
9770 Op.getOperand(7), // cachepolicy, swizzled buffer
9771 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9772 };
9773 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9774 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9775 MemSDNode *M = cast<MemSDNode>(Op);
9776 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9777 M->getMemoryVT(), M->getMemOperand());
9778 }
9779
9780 case Intrinsic::amdgcn_raw_buffer_store:
9781 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9782 case Intrinsic::amdgcn_raw_buffer_store_format:
9783 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9784 const bool IsFormat =
9785 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9786 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9787
9788 SDValue VData = Op.getOperand(2);
9789 EVT VDataVT = VData.getValueType();
9790 EVT EltType = VDataVT.getScalarType();
9791 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9792 if (IsD16) {
9793 VData = handleD16VData(VData, DAG);
9794 VDataVT = VData.getValueType();
9795 }
9796
9797 if (!isTypeLegal(VDataVT)) {
9798 VData =
9799 DAG.getNode(ISD::BITCAST, DL,
9800 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9801 }
9802
9803 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9804 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9805 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9806 SDValue Ops[] = {
9807 Chain,
9808 VData,
9809 Rsrc,
9810 DAG.getConstant(0, DL, MVT::i32), // vindex
9811 VOffset, // voffset
9812 SOffset, // soffset
9813 Offset, // offset
9814 Op.getOperand(6), // cachepolicy, swizzled buffer
9815 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9816 };
9817 unsigned Opc =
9818 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9819 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9820 MemSDNode *M = cast<MemSDNode>(Op);
9821
9822 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9823 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9824 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9825
9826 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9827 M->getMemoryVT(), M->getMemOperand());
9828 }
9829
9830 case Intrinsic::amdgcn_struct_buffer_store:
9831 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9832 case Intrinsic::amdgcn_struct_buffer_store_format:
9833 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9834 const bool IsFormat =
9835 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9836 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9837
9838 SDValue VData = Op.getOperand(2);
9839 EVT VDataVT = VData.getValueType();
9840 EVT EltType = VDataVT.getScalarType();
9841 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9842
9843 if (IsD16) {
9844 VData = handleD16VData(VData, DAG);
9845 VDataVT = VData.getValueType();
9846 }
9847
9848 if (!isTypeLegal(VDataVT)) {
9849 VData =
9850 DAG.getNode(ISD::BITCAST, DL,
9851 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9852 }
9853
9854 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9855 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9856 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9857 SDValue Ops[] = {
9858 Chain,
9859 VData,
9860 Rsrc,
9861 Op.getOperand(4), // vindex
9862 VOffset, // voffset
9863 SOffset, // soffset
9864 Offset, // offset
9865 Op.getOperand(7), // cachepolicy, swizzled buffer
9866 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9867 };
9868 unsigned Opc =
9869 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9870 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9871 MemSDNode *M = cast<MemSDNode>(Op);
9872
9873 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9874 EVT VDataType = VData.getValueType().getScalarType();
9875 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9876 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9877
9878 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9879 M->getMemoryVT(), M->getMemOperand());
9880 }
9881 case Intrinsic::amdgcn_raw_buffer_load_lds:
9882 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9883 case Intrinsic::amdgcn_struct_buffer_load_lds:
9884 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9885 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9886 unsigned Opc;
9887 bool HasVIndex =
9888 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9889 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9890 unsigned OpOffset = HasVIndex ? 1 : 0;
9891 SDValue VOffset = Op.getOperand(5 + OpOffset);
9892 bool HasVOffset = !isNullConstant(VOffset);
9893 unsigned Size = Op->getConstantOperandVal(4);
9894
9895 switch (Size) {
9896 default:
9897 return SDValue();
9898 case 1:
9899 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9900 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9901 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9902 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9903 break;
9904 case 2:
9905 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9906 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9907 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9908 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9909 break;
9910 case 4:
9911 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9912 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9913 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9914 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9915 break;
9916 case 12:
9917 if (!Subtarget->hasLDSLoadB96_B128())
9918 return SDValue();
9919 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9920 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9921 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9922 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9923 break;
9924 case 16:
9925 if (!Subtarget->hasLDSLoadB96_B128())
9926 return SDValue();
9927 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9928 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9929 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9930 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9931 break;
9932 }
9933
9934 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9935
9936 SmallVector<SDValue, 8> Ops;
9937
9938 if (HasVIndex && HasVOffset)
9939 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9940 {Op.getOperand(5), // VIndex
9941 VOffset}));
9942 else if (HasVIndex)
9943 Ops.push_back(Op.getOperand(5));
9944 else if (HasVOffset)
9945 Ops.push_back(VOffset);
9946
9947 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9948 Ops.push_back(Rsrc);
9949 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9950 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9951 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9952 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9953 Ops.push_back(DAG.getTargetConstant(
9954 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9955 DL, MVT::i8)); // cpol
9956 Ops.push_back(DAG.getTargetConstant(
9957 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9958 ? 1
9959 : 0,
9960 DL, MVT::i8)); // swz
9961 Ops.push_back(M0Val.getValue(0)); // Chain
9962 Ops.push_back(M0Val.getValue(1)); // Glue
9963
9964 auto *M = cast<MemSDNode>(Op);
9965 MachineMemOperand *LoadMMO = M->getMemOperand();
9966 // Don't set the offset value here because the pointer points to the base of
9967 // the buffer.
9968 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9969
9970 MachinePointerInfo StorePtrI = LoadPtrI;
9971 LoadPtrI.V = PoisonValue::get(
9975
9976 auto F = LoadMMO->getFlags() &
9977 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9978 LoadMMO =
9979 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9980 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9981
9982 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9983 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9984 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9985
9986 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9987 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9988
9989 return SDValue(Load, 0);
9990 }
9991 case Intrinsic::amdgcn_global_load_lds: {
9992 unsigned Opc;
9993 unsigned Size = Op->getConstantOperandVal(4);
9994 switch (Size) {
9995 default:
9996 return SDValue();
9997 case 1:
9998 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9999 break;
10000 case 2:
10001 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10002 break;
10003 case 4:
10004 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10005 break;
10006 case 12:
10007 if (!Subtarget->hasLDSLoadB96_B128())
10008 return SDValue();
10009 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10010 break;
10011 case 16:
10012 if (!Subtarget->hasLDSLoadB96_B128())
10013 return SDValue();
10014 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10015 break;
10016 }
10017
10018 auto *M = cast<MemSDNode>(Op);
10019 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10020
10021 SmallVector<SDValue, 6> Ops;
10022
10023 SDValue Addr = Op.getOperand(2); // Global ptr
10024 SDValue VOffset;
10025 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10026 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10027 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10028 SDValue LHS = Addr.getOperand(0);
10029 SDValue RHS = Addr.getOperand(1);
10030
10031 if (LHS->isDivergent())
10032 std::swap(LHS, RHS);
10033
10034 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10035 RHS.getOperand(0).getValueType() == MVT::i32) {
10036 // add (i64 sgpr), (zero_extend (i32 vgpr))
10037 Addr = LHS;
10038 VOffset = RHS.getOperand(0);
10039 }
10040 }
10041
10042 Ops.push_back(Addr);
10043 if (!Addr->isDivergent()) {
10044 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10045 if (!VOffset)
10046 VOffset =
10047 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10048 DAG.getTargetConstant(0, DL, MVT::i32)),
10049 0);
10050 Ops.push_back(VOffset);
10051 }
10052
10053 Ops.push_back(Op.getOperand(5)); // Offset
10054 Ops.push_back(Op.getOperand(6)); // CPol
10055 Ops.push_back(M0Val.getValue(0)); // Chain
10056 Ops.push_back(M0Val.getValue(1)); // Glue
10057
10058 MachineMemOperand *LoadMMO = M->getMemOperand();
10059 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10060 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10061 MachinePointerInfo StorePtrI = LoadPtrI;
10062 LoadPtrI.V = PoisonValue::get(
10063 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10064 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10065 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10066 auto F = LoadMMO->getFlags() &
10067 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10068 LoadMMO =
10069 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10070 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10071 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10072 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10073 LoadMMO->getAAInfo());
10074
10075 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10076 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10077
10078 return SDValue(Load, 0);
10079 }
10080 case Intrinsic::amdgcn_end_cf:
10081 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10082 Op->getOperand(2), Chain),
10083 0);
10084 case Intrinsic::amdgcn_s_barrier_init:
10085 case Intrinsic::amdgcn_s_barrier_signal_var: {
10086 // these two intrinsics have two operands: barrier pointer and member count
10087 SDValue Chain = Op->getOperand(0);
10088 SmallVector<SDValue, 2> Ops;
10089 SDValue BarOp = Op->getOperand(2);
10090 SDValue CntOp = Op->getOperand(3);
10091 SDValue M0Val;
10092 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10093 ? AMDGPU::S_BARRIER_INIT_M0
10094 : AMDGPU::S_BARRIER_SIGNAL_M0;
10095 // extract the BarrierID from bits 4-9 of BarOp
10096 SDValue BarID;
10097 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10098 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10099 BarID =
10100 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10101 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10102 0);
10103 // Member count should be put into M0[ShAmt:+6]
10104 // Barrier ID should be put into M0[5:0]
10105 M0Val =
10106 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10107 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10108 0);
10109 constexpr unsigned ShAmt = 16;
10110 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10111 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10112
10113 M0Val = SDValue(
10114 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10115
10116 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10117
10118 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10119 return SDValue(NewMI, 0);
10120 }
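// Illustrative sketch (not from the LLVM sources): the M0 layout built above
// for S_BARRIER_INIT_M0 / S_BARRIER_SIGNAL_M0, with the barrier ID (bits
// [9:4] of the barrier operand) placed in M0[5:0] and the member count
// shifted to bit 16. The helper name is hypothetical.
#include <cstdint>

static uint32_t packBarrierM0(uint32_t BarrierOperand, uint32_t MemberCount) {
  uint32_t BarID = (BarrierOperand >> 4) & 0x3F; // bits [9:4] -> M0[5:0]
  return (MemberCount << 16) | BarID;            // ShAmt == 16
}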
10121 case Intrinsic::amdgcn_s_barrier_join: {
10122 // these three intrinsics have one operand: barrier pointer
10123 SDValue Chain = Op->getOperand(0);
10124 SmallVector<SDValue, 2> Ops;
10125 SDValue BarOp = Op->getOperand(2);
10126 unsigned Opc;
10127
10128 if (isa<ConstantSDNode>(BarOp)) {
10129 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10130 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10131
10132 // extract the BarrierID from bits 4-9 of the immediate
10133 unsigned BarID = (BarVal >> 4) & 0x3F;
10134 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10135 Ops.push_back(K);
10136 Ops.push_back(Chain);
10137 } else {
10138 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10139
10140 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10141 SDValue M0Val;
10142 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10143 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10144 M0Val =
10145 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10146 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10147 0);
10148 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10149 }
10150
10151 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10152 return SDValue(NewMI, 0);
10153 }
10154 case Intrinsic::amdgcn_s_prefetch_data: {
10155 // For non-global address space preserve the chain and remove the call.
10156 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10157 return Op.getOperand(0);
10158 return Op;
10159 }
10160 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10161 SDValue Ops[] = {
10162 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10163 Op.getOperand(3), // offset
10164 Op.getOperand(4), // length
10165 };
10166
10167 MemSDNode *M = cast<MemSDNode>(Op);
10168 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10169 Op->getVTList(), Ops, M->getMemoryVT(),
10170 M->getMemOperand());
10171 }
10172 default: {
10173 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10174 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10175 return lowerImage(Op, ImageDimIntr, DAG, true);
10176
10177 return Op;
10178 }
10179 }
10180}
10181
10182// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10183// offset (the offset that is included in bounds checking and swizzling, to be
10184// split between the instruction's voffset and immoffset fields) and soffset
10185// (the offset that is excluded from bounds checking and swizzling, to go in
10186// the instruction's soffset field). This function takes the first kind of
10187// offset and figures out how to split it between voffset and immoffset.
10188std::pair<SDValue, SDValue>
10189SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10190 SDLoc DL(Offset);
10191 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10192 SDValue N0 = Offset;
10193 ConstantSDNode *C1 = nullptr;
10194
10195 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10196 N0 = SDValue();
10197 else if (DAG.isBaseWithConstantOffset(N0)) {
10198 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10199 N0 = N0.getOperand(0);
10200 }
10201
10202 if (C1) {
10203 unsigned ImmOffset = C1->getZExtValue();
10204 // If the immediate value is too big for the immoffset field, put only bits
10205 // that would normally fit in the immoffset field. The remaining value that
10206 // is copied/added for the voffset field is a large power of 2, and it
10207 // stands more chance of being CSEd with the copy/add for another similar
10208 // load/store.
10209 // However, do not do that rounding down if that is a negative
10210 // number, as it appears to be illegal to have a negative offset in the
10211 // vgpr, even if adding the immediate offset makes it positive.
10212 unsigned Overflow = ImmOffset & ~MaxImm;
10213 ImmOffset -= Overflow;
10214 if ((int32_t)Overflow < 0) {
10215 Overflow += ImmOffset;
10216 ImmOffset = 0;
10217 }
10218 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10219 if (Overflow) {
10220 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10221 if (!N0)
10222 N0 = OverflowVal;
10223 else {
10224 SDValue Ops[] = {N0, OverflowVal};
10225 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10226 }
10227 }
10228 }
10229 if (!N0)
10230 N0 = DAG.getConstant(0, DL, MVT::i32);
10231 if (!C1)
10232 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10233 return {N0, SDValue(C1, 0)};
10234}
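// A worked example of the split above, assuming the common 12-bit MUBUF
// immediate (MaxImm == 4095; getMaxMUBUFImmOffset may differ per subtarget):
// a combined offset of 8200 yields Overflow = 8200 & ~4095 = 8192, which goes
// to the voffset as a CSE-friendly power of two, and ImmOffset = 8 for the
// instruction's immediate field. If Overflow came out negative (e.g. for a
// combined offset of 0xFFFFF000), the entire value is moved to the voffset
// and the immediate is left at 0.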
10235
10236// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10237// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10238// pointed to by Offsets.
10239void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10240 SelectionDAG &DAG, SDValue *Offsets,
10241 Align Alignment) const {
10242 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10243 SDLoc DL(CombinedOffset);
10244 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10245 uint32_t Imm = C->getZExtValue();
10246 uint32_t SOffset, ImmOffset;
10247 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10248 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10249 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10250 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10251 return;
10252 }
10253 }
10254 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10255 SDValue N0 = CombinedOffset.getOperand(0);
10256 SDValue N1 = CombinedOffset.getOperand(1);
10257 uint32_t SOffset, ImmOffset;
10258 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10259 if (Offset >= 0 &&
10260 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10261 Offsets[0] = N0;
10262 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10263 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10264 return;
10265 }
10266 }
10267
10268 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10269 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10270 : DAG.getConstant(0, DL, MVT::i32);
10271
10272 Offsets[0] = CombinedOffset;
10273 Offsets[1] = SOffsetZero;
10274 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10275}
10276
10277SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10278 SelectionDAG &DAG) const {
10279 if (!MaybePointer.getValueType().isScalarInteger())
10280 return MaybePointer;
10281
10282 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10283 return Rsrc;
10284}
10285
10286// Wrap a global or flat pointer into a buffer intrinsic using the flags
10287// specified in the intrinsic.
10288SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10289 SelectionDAG &DAG) const {
10290 SDLoc Loc(Op);
10291
10292 SDValue Pointer = Op->getOperand(1);
10293 SDValue Stride = Op->getOperand(2);
10294 SDValue NumRecords = Op->getOperand(3);
10295 SDValue Flags = Op->getOperand(4);
10296
10297 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10298 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10299 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10300 std::optional<uint32_t> ConstStride = std::nullopt;
10301 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10302 ConstStride = ConstNode->getZExtValue();
10303
10304 SDValue NewHighHalf = Masked;
10305 if (!ConstStride || *ConstStride != 0) {
10306 SDValue ShiftedStride;
10307 if (ConstStride) {
10308 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10309 } else {
10310 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10311 ShiftedStride =
10312 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10313 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10314 }
10315 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10316 }
10317
10318 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10319 NewHighHalf, NumRecords, Flags);
10320 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10321 return RsrcPtr;
10322}
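// Rough sketch of the resulting V# layout (assuming the usual buffer
// descriptor format): word0 holds the low 32 address bits, word1 packs the
// high address bits in [15:0] and the stride in [29:16] (hence the 0x0000ffff
// mask and the shift by 16 above), word2 is NumRecords and word3 is the flags
// operand. For example, a constant stride of 4 with an address high half of
// 0x00007abc produces word1 = 0x00047abc.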
10323
10324// Handle 8 bit and 16 bit buffer loads
10325SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10326 EVT LoadVT, SDLoc DL,
10327 ArrayRef<SDValue> Ops,
10328 MachineMemOperand *MMO,
10329 bool IsTFE) const {
10330 EVT IntVT = LoadVT.changeTypeToInteger();
10331
10332 if (IsTFE) {
10333 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10334 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10335 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10336 MachineFunction &MF = DAG.getMachineFunction();
10337 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10338 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10339 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10340 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10341 DAG.getConstant(1, DL, MVT::i32));
10342 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10343 DAG.getConstant(0, DL, MVT::i32));
10344 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10345 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10346 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10347 }
10348
10349 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10350 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10351 : AMDGPUISD::BUFFER_LOAD_USHORT;
10352
10353 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10354 SDValue BufferLoad =
10355 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10356 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10357 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10358
10359 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10360}
10361
10362// Handle 8 bit and 16 bit buffer stores
10363SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10364 EVT VDataType, SDLoc DL,
10365 SDValue Ops[],
10366 MemSDNode *M) const {
10367 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10368 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10369
10370 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10371 Ops[1] = BufferStoreExt;
10372 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10373 : AMDGPUISD::BUFFER_STORE_SHORT;
10374 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10375 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10376 M->getMemOperand());
10377}
10378
10379 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10380 SDValue Op, const SDLoc &SL, EVT VT) {
10381 if (VT.bitsLT(Op.getValueType()))
10382 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10383
10384 switch (ExtType) {
10385 case ISD::SEXTLOAD:
10386 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10387 case ISD::ZEXTLOAD:
10388 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10389 case ISD::EXTLOAD:
10390 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10391 case ISD::NON_EXTLOAD:
10392 return Op;
10393 }
10394
10395 llvm_unreachable("invalid ext type");
10396}
10397
10398// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10399// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10400SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10401 DAGCombinerInfo &DCI) const {
10402 SelectionDAG &DAG = DCI.DAG;
10403 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10404 return SDValue();
10405
10406 // FIXME: Constant loads should all be marked invariant.
10407 unsigned AS = Ld->getAddressSpace();
10408 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10409 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10410 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10411 return SDValue();
10412
10413 // Don't do this early, since it may interfere with adjacent load merging for
10414 // illegal types. We can avoid losing alignment information for exotic types
10415 // pre-legalize.
10416 EVT MemVT = Ld->getMemoryVT();
10417 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10418 MemVT.getSizeInBits() >= 32)
10419 return SDValue();
10420
10421 SDLoc SL(Ld);
10422
10423 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10424 "unexpected vector extload");
10425
10426 // TODO: Drop only high part of range.
10427 SDValue Ptr = Ld->getBasePtr();
10428 SDValue NewLoad = DAG.getLoad(
10429 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10430 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10431 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10432 nullptr); // Drop ranges
10433
10434 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10435 if (MemVT.isFloatingPoint()) {
10437 "unexpected fp extload");
10438 TruncVT = MemVT.changeTypeToInteger();
10439 }
10440
10441 SDValue Cvt = NewLoad;
10442 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10443 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10444 DAG.getValueType(TruncVT));
10445 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10446 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10447 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10448 } else {
10449 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10450 }
10451
10452 EVT VT = Ld->getValueType(0);
10453 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10454
10455 DCI.AddToWorklist(Cvt.getNode());
10456
10457 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10458 // the appropriate extension from the 32-bit load.
10459 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10460 DCI.AddToWorklist(Cvt.getNode());
10461
10462 // Handle conversion back to floating point if necessary.
10463 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10464
10465 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10466}
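// For example, a uniform i16 sextload from the constant address space with
// alignment of at least 4 is rewritten here as a full i32 load followed by
// SIGN_EXTEND_INREG, so instruction selection can use a scalar SMEM load
// instead of a per-lane buffer or flat load.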
10467
10468 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10469 const SIMachineFunctionInfo &Info) {
10470 // TODO: Should check if the address can definitely not access stack.
10471 if (Info.isEntryFunction())
10472 return Info.getUserSGPRInfo().hasFlatScratchInit();
10473 return true;
10474}
10475
10476SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10477 SDLoc DL(Op);
10478 LoadSDNode *Load = cast<LoadSDNode>(Op);
10479 ISD::LoadExtType ExtType = Load->getExtensionType();
10480 EVT MemVT = Load->getMemoryVT();
10481 MachineMemOperand *MMO = Load->getMemOperand();
10482
10483 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10484 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10485 return SDValue();
10486
10487 // FIXME: Copied from PPC
10488 // First, load into 32 bits, then truncate to 1 bit.
10489
10490 SDValue Chain = Load->getChain();
10491 SDValue BasePtr = Load->getBasePtr();
10492
10493 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10494
10495 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10496 RealMemVT, MMO);
10497
10498 if (!MemVT.isVector()) {
10499 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10500 NewLD.getValue(1)};
10501
10502 return DAG.getMergeValues(Ops, DL);
10503 }
10504
10505 SmallVector<SDValue, 3> Elts;
10506 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10507 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10508 DAG.getConstant(I, DL, MVT::i32));
10509
10510 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10511 }
10512
10513 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10514
10515 return DAG.getMergeValues(Ops, DL);
10516 }
10517
10518 if (!MemVT.isVector())
10519 return SDValue();
10520
10521 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10522 "Custom lowering for non-i32 vectors hasn't been implemented.");
10523
10524 Align Alignment = Load->getAlign();
10525 unsigned AS = Load->getAddressSpace();
10526 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10527 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10528 return SplitVectorLoad(Op, DAG);
10529 }
10530
10531 MachineFunction &MF = DAG.getMachineFunction();
10532 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10533 // If there is a possibility that flat instruction access scratch memory
10534 // then we need to use the same legalization rules we use for private.
10535 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10536 !Subtarget->hasMultiDwordFlatScratchAddressing())
10537 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10538 ? AMDGPUAS::PRIVATE_ADDRESS
10539 : AMDGPUAS::GLOBAL_ADDRESS;
10540
10541 unsigned NumElements = MemVT.getVectorNumElements();
10542
10543 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10544 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10545 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10546 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10547 isMemOpHasNoClobberedMemOperand(Load))) {
10548 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10549 Alignment >= Align(4) && NumElements < 32) {
10550 if (MemVT.isPow2VectorType() ||
10551 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10552 return SDValue();
10553 return WidenOrSplitVectorLoad(Op, DAG);
10554 }
10555 // Non-uniform loads will be selected to MUBUF instructions, so they
10556 // have the same legalization requirements as global and private
10557 // loads.
10558 //
10559 }
10560 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10561 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10562 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10563 if (NumElements > 4)
10564 return SplitVectorLoad(Op, DAG);
10565 // v3 loads not supported on SI.
10566 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10567 return WidenOrSplitVectorLoad(Op, DAG);
10568
10569 // v3 and v4 loads are supported for private and global memory.
10570 return SDValue();
10571 }
10572 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10573 // Depending on the setting of the private_element_size field in the
10574 // resource descriptor, we can only make private accesses up to a certain
10575 // size.
10576 switch (Subtarget->getMaxPrivateElementSize()) {
10577 case 4: {
10578 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10579 return DAG.getMergeValues({Op0, Op1}, DL);
10580 }
10581 case 8:
10582 if (NumElements > 2)
10583 return SplitVectorLoad(Op, DAG);
10584 return SDValue();
10585 case 16:
10586 // Same as global/flat
10587 if (NumElements > 4)
10588 return SplitVectorLoad(Op, DAG);
10589 // v3 loads not supported on SI.
10590 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10591 return WidenOrSplitVectorLoad(Op, DAG);
10592
10593 return SDValue();
10594 default:
10595 llvm_unreachable("unsupported private_element_size");
10596 }
10597 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10598 unsigned Fast = 0;
10599 auto Flags = Load->getMemOperand()->getFlags();
10600 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10601 Load->getAlign(), Flags, &Fast) &&
10602 Fast > 1)
10603 return SDValue();
10604
10605 if (MemVT.isVector())
10606 return SplitVectorLoad(Op, DAG);
10607 }
10608
10609 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10610 MemVT, *Load->getMemOperand())) {
10611 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10612 return DAG.getMergeValues({Op0, Op1}, DL);
10613 }
10614
10615 return SDValue();
10616}
10617
10618SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10619 EVT VT = Op.getValueType();
10620 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10621 VT.getSizeInBits() == 512)
10622 return splitTernaryVectorOp(Op, DAG);
10623
10624 assert(VT.getSizeInBits() == 64);
10625
10626 SDLoc DL(Op);
10627 SDValue Cond = Op.getOperand(0);
10628
10629 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10630 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10631
10632 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10633 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10634
10635 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10636 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10637
10638 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10639
10640 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10641 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10642
10643 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10644
10645 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10646 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10647}
10648
10649// Catch division cases where we can use shortcuts with rcp and rsq
10650// instructions.
10651SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10652 SelectionDAG &DAG) const {
10653 SDLoc SL(Op);
10654 SDValue LHS = Op.getOperand(0);
10655 SDValue RHS = Op.getOperand(1);
10656 EVT VT = Op.getValueType();
10657 const SDNodeFlags Flags = Op->getFlags();
10658
10659 bool AllowInaccurateRcp =
10660 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10661
10662 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10663 // Without !fpmath accuracy information, we can't do more because we don't
10664 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10665 // f16 is always accurate enough
10666 if (!AllowInaccurateRcp && VT != MVT::f16)
10667 return SDValue();
10668
10669 if (CLHS->isExactlyValue(1.0)) {
10670 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
10671 // the CI documentation, have a worst case error of 1 ulp.
10672 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10673 // use it as long as we aren't trying to use denormals.
10674 //
10675 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10676
10677 // 1.0 / sqrt(x) -> rsq(x)
10678
10679 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10680 // error seems really high at 2^29 ULP.
10681 // 1.0 / x -> rcp(x)
10682 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10683 }
10684
10685 // Same as for 1.0, but expand the sign out of the constant.
10686 if (CLHS->isExactlyValue(-1.0)) {
10687 // -1.0 / x -> rcp (fneg x)
10688 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10689 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10690 }
10691 }
10692
10693 // For f16 require afn or arcp.
10694 // For f32 require afn.
10695 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10696 return SDValue();
10697
10698 // Turn into multiply by the reciprocal.
10699 // x / y -> x * (1.0 / y)
10700 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10701 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10702}
10703
10704SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10705 SelectionDAG &DAG) const {
10706 SDLoc SL(Op);
10707 SDValue X = Op.getOperand(0);
10708 SDValue Y = Op.getOperand(1);
10709 EVT VT = Op.getValueType();
10710 const SDNodeFlags Flags = Op->getFlags();
10711
10712 bool AllowInaccurateDiv =
10713 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10714 if (!AllowInaccurateDiv)
10715 return SDValue();
10716
10717 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10718 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10719
10720 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10721 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10722
10723 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10724 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10725 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10726 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10727 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10728 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10729}
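// The sequence above is two Newton-Raphson refinements of r ~= 1/y followed by
// a residual correction of the quotient: Tmp0/Tmp1 compute e = 1 - y*r via
// fma, each refinement performs r <- r + r*e, and the final fma returns
// ret + r*(x - y*ret), so the rcp error shrinks quadratically per step.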
10730
10731static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10732 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10733 SDNodeFlags Flags) {
10734 if (GlueChain->getNumValues() <= 1) {
10735 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10736 }
10737
10738 assert(GlueChain->getNumValues() == 3);
10739
10740 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10741 switch (Opcode) {
10742 default:
10743 llvm_unreachable("no chain equivalent for opcode");
10744 case ISD::FMUL:
10745 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10746 break;
10747 }
10748
10749 return DAG.getNode(Opcode, SL, VTList,
10750 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10751 Flags);
10752}
10753
10754static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10755 EVT VT, SDValue A, SDValue B, SDValue C,
10756 SDValue GlueChain, SDNodeFlags Flags) {
10757 if (GlueChain->getNumValues() <= 1) {
10758 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10759 }
10760
10761 assert(GlueChain->getNumValues() == 3);
10762
10763 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10764 switch (Opcode) {
10765 default:
10766 llvm_unreachable("no chain equivalent for opcode");
10767 case ISD::FMA:
10768 Opcode = AMDGPUISD::FMA_W_CHAIN;
10769 break;
10770 }
10771
10772 return DAG.getNode(Opcode, SL, VTList,
10773 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10774 Flags);
10775}
10776
10777SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10778 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10779 return FastLowered;
10780
10781 SDLoc SL(Op);
10782 SDValue LHS = Op.getOperand(0);
10783 SDValue RHS = Op.getOperand(1);
10784
10785 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10786 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10787 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10788 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10789 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10790 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10791 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10792 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10793 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10794 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10795 // q16.u = opx(V_CVT_F16_F32, q32.u);
10796 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10797
10798 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10799 unsigned FMADOpCode =
10800 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10801
10802 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10803 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10804 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10805 SDValue Rcp =
10806 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10807 SDValue Quot =
10808 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10809 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10810 Op->getFlags());
10811 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10812 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10813 Op->getFlags());
10814 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10815 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10816 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10817 DAG.getConstant(0xff800000, SL, MVT::i32));
10818 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10819 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10820 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10821 DAG.getTargetConstant(0, SL, MVT::i32));
10822 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10823 Op->getFlags());
10824}
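// Note on the 0xff800000 mask above: it keeps only the sign and exponent bits
// of the f32 correction term (err * rcp), truncating it to a signed power of
// two before it is added back into the 32-bit quotient and the result is
// rounded to f16.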
10825
10826// Faster 2.5 ULP division that does not support denormals.
10827SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10828 SDNodeFlags Flags = Op->getFlags();
10829 SDLoc SL(Op);
10830 SDValue LHS = Op.getOperand(1);
10831 SDValue RHS = Op.getOperand(2);
10832
10833 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10834
10835 const APFloat K0Val(0x1p+96f);
10836 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10837
10838 const APFloat K1Val(0x1p-32f);
10839 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10840
10841 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10842
10843 EVT SetCCVT =
10844 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10845
10846 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10847
10848 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10849
10850 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10851
10852 // rcp does not support denormals.
10853 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10854
10855 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10856
10857 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10858}
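// The pre-scaling above keeps v_rcp_f32 in range: when |denominator| exceeds
// 2^96 it is first multiplied by 2^-32 (r3), rcp is taken of the scaled value,
// and the same factor is applied to the final product, so
// lhs * rcp(rhs * s) * s ~= lhs / rhs without the reciprocal underflowing into
// the denormal range that rcp flushes to zero.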
10859
10860// Returns immediate value for setting the F32 denorm mode when using the
10861// S_DENORM_MODE instruction.
10862 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10863 const SIMachineFunctionInfo *Info,
10864 const GCNSubtarget *ST) {
10865 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10866 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10867 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10868 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10869}
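// Sketch of the encoding, assuming the usual FP_DENORM_* values (0 =
// flush-in/flush-out ... 3 = allow denormals): the low two bits of the
// S_DENORM_MODE immediate select the f32 behavior and bits [3:2] carry the
// current f64/f16 behavior, so enabling f32 denormals while f64/f16 stays at
// flush-in/flush-out encodes as 3 | (0 << 2) = 3.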
10870
10871SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10872 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10873 return FastLowered;
10874
10875 // The selection matcher assumes anything with a chain selecting to a
10876 // mayRaiseFPException machine instruction. Since we're introducing a chain
10877 // here, we need to explicitly report nofpexcept for the regular fdiv
10878 // lowering.
10879 SDNodeFlags Flags = Op->getFlags();
10880 Flags.setNoFPExcept(true);
10881
10882 SDLoc SL(Op);
10883 SDValue LHS = Op.getOperand(0);
10884 SDValue RHS = Op.getOperand(1);
10885
10886 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10887
10888 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10889
10890 SDValue DenominatorScaled =
10891 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10892 SDValue NumeratorScaled =
10893 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10894
10895 // Denominator is scaled to not be denormal, so using rcp is ok.
10896 SDValue ApproxRcp =
10897 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10898 SDValue NegDivScale0 =
10899 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10900
10901 using namespace AMDGPU::Hwreg;
10902 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10903 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10904
10905 const MachineFunction &MF = DAG.getMachineFunction();
10906 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10907 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10908
10909 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10910 const bool HasDynamicDenormals =
10911 (DenormMode.Input == DenormalMode::Dynamic) ||
10912 (DenormMode.Output == DenormalMode::Dynamic);
10913
10914 SDValue SavedDenormMode;
10915
10916 if (!PreservesDenormals) {
10917 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10918 // lowering. The chain dependence is insufficient, and we need glue. We do
10919 // not need the glue variants in a strictfp function.
10920
10921 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10922
10923 SDValue Glue = DAG.getEntryNode();
10924 if (HasDynamicDenormals) {
10925 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10926 DAG.getVTList(MVT::i32, MVT::Glue),
10927 {BitField, Glue});
10928 SavedDenormMode = SDValue(GetReg, 0);
10929
10930 Glue = DAG.getMergeValues(
10931 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10932 }
10933
10934 SDNode *EnableDenorm;
10935 if (Subtarget->hasDenormModeInst()) {
10936 const SDValue EnableDenormValue =
10937 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10938
10939 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10940 EnableDenormValue)
10941 .getNode();
10942 } else {
10943 const SDValue EnableDenormValue =
10944 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10945 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10946 {EnableDenormValue, BitField, Glue});
10947 }
10948
10949 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10950 SDValue(EnableDenorm, 1)};
10951
10952 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10953 }
10954
10955 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10956 ApproxRcp, One, NegDivScale0, Flags);
10957
10958 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10959 ApproxRcp, Fma0, Flags);
10960
10961 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10962 Fma1, Flags);
10963
10964 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10965 NumeratorScaled, Mul, Flags);
10966
10967 SDValue Fma3 =
10968 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10969
10970 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10971 NumeratorScaled, Fma3, Flags);
10972
10973 if (!PreservesDenormals) {
10974 SDNode *DisableDenorm;
10975 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10976 const SDValue DisableDenormValue = getSPDenormModeValue(
10977 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10978
10979 DisableDenorm =
10980 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10981 DisableDenormValue, Fma4.getValue(2))
10982 .getNode();
10983 } else {
10984 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10985 const SDValue DisableDenormValue =
10986 HasDynamicDenormals
10987 ? SavedDenormMode
10988 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10989
10990 DisableDenorm = DAG.getMachineNode(
10991 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10992 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10993 }
10994
10995 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10996 SDValue(DisableDenorm, 0), DAG.getRoot());
10997 DAG.setRoot(OutputChain);
10998 }
10999
11000 SDValue Scale = NumeratorScaled.getValue(1);
11001 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11002 {Fma4, Fma1, Fma3, Scale}, Flags);
11003
11004 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11005}
11006
11007SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11008 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11009 return FastLowered;
11010
11011 SDLoc SL(Op);
11012 SDValue X = Op.getOperand(0);
11013 SDValue Y = Op.getOperand(1);
11014
11015 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11016
11017 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11018
11019 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11020
11021 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11022
11023 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11024
11025 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11026
11027 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11028
11029 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11030
11031 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11032
11033 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11034 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11035
11036 SDValue Fma4 =
11037 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11038
11039 SDValue Scale;
11040
11041 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11042 // Work around a hardware bug on SI where the condition output from div_scale
11043 // is not usable.
11044
11045 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11046
11047 // Figure out which scale to use for div_fmas.
11048 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11049 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11050 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11051 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11052
11053 SDValue NumHi =
11054 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11055 SDValue DenHi =
11056 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11057
11058 SDValue Scale0Hi =
11059 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11060 SDValue Scale1Hi =
11061 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11062
11063 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11064 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11065 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11066 } else {
11067 Scale = DivScale1.getValue(1);
11068 }
11069
11070 SDValue Fmas =
11071 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11072
11073 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11074}
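// Sketch of the workaround above: because the condition output of div_scale is
// unusable on these subtargets, the high dword of each div_scale result is
// compared with the high dword of the corresponding original operand to detect
// whether that operand was rescaled, and the XOR of the two comparisons stands
// in for the flag that div_fmas expects.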
11075
11076SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11077 EVT VT = Op.getValueType();
11078
11079 if (VT == MVT::f32)
11080 return LowerFDIV32(Op, DAG);
11081
11082 if (VT == MVT::f64)
11083 return LowerFDIV64(Op, DAG);
11084
11085 if (VT == MVT::f16)
11086 return LowerFDIV16(Op, DAG);
11087
11088 llvm_unreachable("Unexpected type for fdiv");
11089}
11090
11091SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11092 SDLoc dl(Op);
11093 SDValue Val = Op.getOperand(0);
11094 EVT VT = Val.getValueType();
11095 EVT ResultExpVT = Op->getValueType(1);
11096 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11097
11098 SDValue Mant = DAG.getNode(
11099 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11100 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11101
11102 SDValue Exp = DAG.getNode(
11103 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11104 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11105
11106 if (Subtarget->hasFractBug()) {
11107 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11108 SDValue Inf =
11109 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11110
11111 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11112 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11113 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11114 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11115 }
11116
11117 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11118 return DAG.getMergeValues({Mant, CastExp}, dl);
11119}
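// Worked example of the intended frexp semantics: for an input of 8.0 the
// mantissa intrinsic yields 0.5 and the exponent intrinsic yields 4, since
// 8.0 == 0.5 * 2^4; on subtargets with the fract bug, the selects above force
// non-finite inputs back to the pair (Val, 0).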
11120
11121SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11122 SDLoc DL(Op);
11123 StoreSDNode *Store = cast<StoreSDNode>(Op);
11124 EVT VT = Store->getMemoryVT();
11125
11126 if (VT == MVT::i1) {
11127 return DAG.getTruncStore(
11128 Store->getChain(), DL,
11129 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11130 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11131 }
11132
11133 assert(VT.isVector() &&
11134 Store->getValue().getValueType().getScalarType() == MVT::i32);
11135
11136 unsigned AS = Store->getAddressSpace();
11137 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11138 Store->getAlign().value() < VT.getStoreSize() &&
11139 VT.getSizeInBits() > 32) {
11140 return SplitVectorStore(Op, DAG);
11141 }
11142
11143 MachineFunction &MF = DAG.getMachineFunction();
11144 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11145 // If there is a possibility that flat instruction access scratch memory
11146 // then we need to use the same legalization rules we use for private.
11147 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11148 !Subtarget->hasMultiDwordFlatScratchAddressing())
11149 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11150 ? AMDGPUAS::PRIVATE_ADDRESS
11151 : AMDGPUAS::GLOBAL_ADDRESS;
11152
11153 unsigned NumElements = VT.getVectorNumElements();
11154 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11155 if (NumElements > 4)
11156 return SplitVectorStore(Op, DAG);
11157 // v3 stores not supported on SI.
11158 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11159 return SplitVectorStore(Op, DAG);
11160
11161 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11162 VT, *Store->getMemOperand()))
11163 return expandUnalignedStore(Store, DAG);
11164
11165 return SDValue();
11166 }
11167 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11168 switch (Subtarget->getMaxPrivateElementSize()) {
11169 case 4:
11170 return scalarizeVectorStore(Store, DAG);
11171 case 8:
11172 if (NumElements > 2)
11173 return SplitVectorStore(Op, DAG);
11174 return SDValue();
11175 case 16:
11176 if (NumElements > 4 ||
11177 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11178 return SplitVectorStore(Op, DAG);
11179 return SDValue();
11180 default:
11181 llvm_unreachable("unsupported private_element_size");
11182 }
11183 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11184 unsigned Fast = 0;
11185 auto Flags = Store->getMemOperand()->getFlags();
11186 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11187 Store->getAlign(), Flags, &Fast) &&
11188 Fast > 1)
11189 return SDValue();
11190
11191 if (VT.isVector())
11192 return SplitVectorStore(Op, DAG);
11193
11194 return expandUnalignedStore(Store, DAG);
11195 }
11196
11197 // Probably an invalid store. If so we'll end up emitting a selection error.
11198 return SDValue();
11199}
11200
11201// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11202SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11203 SDLoc SL(Op);
11204 assert(!Subtarget->has16BitInsts());
11205 SDNodeFlags Flags = Op->getFlags();
11206 SDValue Ext =
11207 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11208
11209 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11210 SDValue Sqrt =
11211 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11212
11213 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11214 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11215}
11216
11217SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11218 SDLoc DL(Op);
11219 SDNodeFlags Flags = Op->getFlags();
11220 MVT VT = Op.getValueType().getSimpleVT();
11221 const SDValue X = Op.getOperand(0);
11222
11223 if (allowApproxFunc(DAG, Flags)) {
11224 // Instruction is 1ulp but ignores denormals.
11225 return DAG.getNode(
11226 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11227 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11228 }
11229
11230 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11231 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11232
11233 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11234
11235 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11236
11237 SDValue SqrtX =
11238 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11239
11240 SDValue SqrtS;
11241 if (needsDenormHandlingF32(DAG, X, Flags)) {
11242 SDValue SqrtID =
11243 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11244 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11245
11246 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11247 SDValue SqrtSNextDownInt =
11248 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11249 DAG.getAllOnesConstant(DL, MVT::i32));
11250 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11251
11252 SDValue NegSqrtSNextDown =
11253 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11254
11255 SDValue SqrtVP =
11256 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11257
11258 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11259 DAG.getConstant(1, DL, MVT::i32));
11260 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11261
11262 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11263 SDValue SqrtVS =
11264 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11265
11266 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11267 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11268
11269 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11270 Flags);
11271
11272 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11273 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11274 Flags);
11275 } else {
11276 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11277
11278 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11279
11280 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11281 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11282 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11283
11284 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11285 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11286 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11287
11288 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11289 SDValue SqrtD =
11290 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11291 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11292 }
11293
11294 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11295
11296 SDValue ScaledDown =
11297 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11298
11299 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11300 SDValue IsZeroOrInf =
11301 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11302 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11303
11304 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11305}
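// The scale factors above follow from sqrt(x * 2^32) == sqrt(x) * 2^16: inputs
// below 2^-96 are multiplied by 2^32 so the approximation runs well inside the
// normal range, and the result is multiplied by 2^-16 to undo the scaling.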
11306
11307SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11308 // For double type, the SQRT and RSQ instructions don't have required
11309 // precision, we apply Goldschmidt's algorithm to improve the result:
11310 //
11311 // y0 = rsq(x)
11312 // g0 = x * y0
11313 // h0 = 0.5 * y0
11314 //
11315 // r0 = 0.5 - h0 * g0
11316 // g1 = g0 * r0 + g0
11317 // h1 = h0 * r0 + h0
11318 //
11319 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11320 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11321 // h2 = h1 * r1 + h1
11322 //
11323 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11324 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11325 //
11326 // sqrt(x) = g3
11327
11328 SDNodeFlags Flags = Op->getFlags();
11329
11330 SDLoc DL(Op);
11331
11332 SDValue X = Op.getOperand(0);
11333 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11334
11335 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11336
11337 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11338
11339 // Scale up input if it is too small.
11340 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11341 SDValue ScaleUp =
11342 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11343 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11344
11345 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11346
11347 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11348
11349 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11350 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11351
11352 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11353 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11354
11355 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11356
11357 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11358
11359 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11360 SDValue SqrtD0 =
11361 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11362
11363 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11364
11365 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11366 SDValue SqrtD1 =
11367 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11368
11369 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11370
11371 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11372 SDValue ScaleDown =
11373 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11374 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11375
11376 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11377 // with finite only or nsz because rsq(+/-0) = +/-inf
11378
11379 // TODO: Check for DAZ and expand to subnormals
11380 SDValue IsZeroOrInf =
11381 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11382 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11383
11384 // If x is +INF, +0, or -0, use its original value
11385 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11386 Flags);
11387}
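// As in the f32 path, the ldexp pair above relies on
// sqrt(x * 2^256) == sqrt(x) * 2^128: inputs below 2^-767 are scaled up by
// 2^256 and the final result is scaled back down by 2^-128.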
11388
11389SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11390 SDLoc DL(Op);
11391 EVT VT = Op.getValueType();
11392 SDValue Arg = Op.getOperand(0);
11393 SDValue TrigVal;
11394
11395 // Propagate fast-math flags so that the multiply we introduce can be folded
11396 // if Arg is already the result of a multiply by constant.
11397 auto Flags = Op->getFlags();
11398
11399 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11400
11401 if (Subtarget->hasTrigReducedRange()) {
11402 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11403 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11404 } else {
11405 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11406 }
11407
11408 switch (Op.getOpcode()) {
11409 case ISD::FCOS:
11410 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11411 case ISD::FSIN:
11412 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11413 default:
11414 llvm_unreachable("Wrong trig opcode");
11415 }
11416}
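// A short worked example, assuming the usual V_SIN_F32/V_COS_F32 semantics
// where the operand is measured in turns rather than radians: lowering sin(pi)
// multiplies by 1/(2*pi) to get 0.5, takes the fractional part on subtargets
// with a reduced input range, and emits SIN_HW(0.5) == sin(pi) == 0.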
11417
11418SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11419 SelectionDAG &DAG) const {
11420 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11421 assert(AtomicNode->isCompareAndSwap());
11422 unsigned AS = AtomicNode->getAddressSpace();
11423
11424 // No custom lowering required for local address space
11425 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11426 return Op;
11427
11428 // Non-local address space requires custom lowering for atomic compare
11429 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11430 SDLoc DL(Op);
11431 SDValue ChainIn = Op.getOperand(0);
11432 SDValue Addr = Op.getOperand(1);
11433 SDValue Old = Op.getOperand(2);
11434 SDValue New = Op.getOperand(3);
11435 EVT VT = Op.getValueType();
11436 MVT SimpleVT = VT.getSimpleVT();
11437 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11438
11439 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11440 SDValue Ops[] = {ChainIn, Addr, NewOld};
11441
11442 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11443 Op->getVTList(), Ops, VT,
11444 AtomicNode->getMemOperand());
11445}
11446
11447//===----------------------------------------------------------------------===//
11448// Custom DAG optimizations
11449//===----------------------------------------------------------------------===//
11450
11451SDValue
11452SITargetLowering::performUCharToFloatCombine(SDNode *N,
11453 DAGCombinerInfo &DCI) const {
11454 EVT VT = N->getValueType(0);
11455 EVT ScalarVT = VT.getScalarType();
11456 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11457 return SDValue();
11458
11459 SelectionDAG &DAG = DCI.DAG;
11460 SDLoc DL(N);
11461
11462 SDValue Src = N->getOperand(0);
11463 EVT SrcVT = Src.getValueType();
11464
11465 // TODO: We could try to match extracting the higher bytes, which would be
11466 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11467 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11468 // about in practice.
11469 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11470 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11471 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11472 DCI.AddToWorklist(Cvt.getNode());
11473
11474 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11475 if (ScalarVT != MVT::f32) {
11476 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11477 DAG.getTargetConstant(0, DL, MVT::i32));
11478 }
11479 return Cvt;
11480 }
11481 }
11482
11483 return SDValue();
11484}
11485
11486SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11487 DAGCombinerInfo &DCI) const {
11488 SDValue MagnitudeOp = N->getOperand(0);
11489 SDValue SignOp = N->getOperand(1);
11490 SelectionDAG &DAG = DCI.DAG;
11491 SDLoc DL(N);
11492
11493 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11494 // lower half with a copy.
11495 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11496 if (MagnitudeOp.getValueType() == MVT::f64) {
11497 SDValue MagAsVector =
11498 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11499 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11500 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11501 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11502 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11503
11504 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11505
11506 SDValue Vector =
11507 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11508
11509 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11510 }
11511
11512 if (SignOp.getValueType() != MVT::f64)
11513 return SDValue();
11514
11515 // Reduce width of sign operand, we only need the highest bit.
11516 //
11517 // fcopysign f64:x, f64:y ->
11518 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11519 // TODO: In some cases it might make sense to go all the way to f16.
11520 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11521 SDValue SignAsF32 =
11522 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11523 DAG.getConstant(1, DL, MVT::i32));
11524
11525 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11526 SignAsF32);
11527}
11528
11529// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11530// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11531// bits
11532
11533// This is a variant of
11534// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11535//
11536// The normal DAG combiner will do this, but only if the add has one use since
11537// that would increase the number of instructions.
11538//
11539// This prevents us from seeing a constant offset that can be folded into a
11540// memory instruction's addressing mode. If we know the resulting add offset of
11541// a pointer can be folded into an addressing offset, we can replace the pointer
11542// operand with the add of new constant offset. This eliminates one of the uses,
11543// and may allow the remaining use to also be simplified.
11544//
11545SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11546 EVT MemVT,
11547 DAGCombinerInfo &DCI) const {
11548 SDValue N0 = N->getOperand(0);
11549 SDValue N1 = N->getOperand(1);
11550
11551 // We only do this to handle cases where it's profitable when there are
11552 // multiple uses of the add, so defer to the standard combine.
11553 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11554 N0->hasOneUse())
11555 return SDValue();
11556
11557 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11558 if (!CN1)
11559 return SDValue();
11560
11561 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11562 if (!CAdd)
11563 return SDValue();
11564
11565 SelectionDAG &DAG = DCI.DAG;
11566
11567 if (N0->getOpcode() == ISD::OR &&
11568 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11569 return SDValue();
11570
11571 // If the resulting offset is too large, we can't fold it into the
11572 // addressing mode offset.
11573 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11574 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11575
11576 AddrMode AM;
11577 AM.HasBaseReg = true;
11578 AM.BaseOffs = Offset.getSExtValue();
11579 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11580 return SDValue();
11581
11582 SDLoc SL(N);
11583 EVT VT = N->getValueType(0);
11584
11585 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11586 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11587
11588 SDNodeFlags Flags;
11589 Flags.setNoUnsignedWrap(
11590 N->getFlags().hasNoUnsignedWrap() &&
11591 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11592
11593 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11594}
11595
11596 /// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer is
11597 /// offset by the chain and intrinsic ID. Theoretically we would also need to
11598 /// check the specific intrinsic, but they all place the pointer operand first.
11599static unsigned getBasePtrIndex(const MemSDNode *N) {
11600 switch (N->getOpcode()) {
11601 case ISD::STORE:
11602 case ISD::INTRINSIC_W_CHAIN:
11603 case ISD::INTRINSIC_VOID:
11604 return 2;
11605 default:
11606 return 1;
11607 }
11608}
11609
11610SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11611 DAGCombinerInfo &DCI) const {
11612 SelectionDAG &DAG = DCI.DAG;
11613 SDLoc SL(N);
11614
11615 unsigned PtrIdx = getBasePtrIndex(N);
11616 SDValue Ptr = N->getOperand(PtrIdx);
11617
11618 // TODO: We could also do this for multiplies.
11619 if (Ptr.getOpcode() == ISD::SHL) {
11620 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11621 N->getMemoryVT(), DCI);
11622 if (NewPtr) {
11623 SmallVector<SDValue, 8> NewOps(N->ops());
11624
11625 NewOps[PtrIdx] = NewPtr;
11626 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11627 }
11628 }
11629
11630 return SDValue();
11631}
11632
11633static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11634 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11635 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11636 (Opc == ISD::XOR && Val == 0);
11637}
11638
11639// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11640// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11641// integer combine opportunities since most 64-bit operations are decomposed
11642// this way. TODO: We won't want this for SALU especially if it is an inline
11643// immediate.
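// For example, (and i64:x, 0x00000000ffffffff) is split into
//   (and lo_32(x), 0xffffffff) and (and hi_32(x), 0x00000000),
// which fold to lo_32(x) and 0 before the halves are recombined into an i64.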
11644SDValue SITargetLowering::splitBinaryBitConstantOp(
11645 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11646 const ConstantSDNode *CRHS) const {
11647 uint64_t Val = CRHS->getZExtValue();
11648 uint32_t ValLo = Lo_32(Val);
11649 uint32_t ValHi = Hi_32(Val);
11650 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11651
11652 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11653 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11654 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11655 // If we need to materialize a 64-bit immediate, it will be split up later
11656 // anyway. Avoid creating the harder to understand 64-bit immediate
11657 // materialization.
11658 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11659 }
11660
11661 return SDValue();
11662}
11663
11664bool llvm::isBoolSGPR(SDValue V) {
11665 if (V.getValueType() != MVT::i1)
11666 return false;
11667 switch (V.getOpcode()) {
11668 default:
11669 break;
11670 case ISD::SETCC:
11671 case AMDGPUISD::FP_CLASS:
11672 return true;
11673 case ISD::AND:
11674 case ISD::OR:
11675 case ISD::XOR:
11676 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11677 }
11678 return false;
11679}
11680
11681// If a constant has all zeroes or all ones within each byte return it.
11682// Otherwise return 0.
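// For example, 0x00ff00ff is returned unchanged, while 0x00000f00 returns 0
// because byte 1 is only partially selected.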
11683static uint32_t getConstantPermuteMask(uint32_t C) {
11684 // 0xff for any zero byte in the mask
11685 uint32_t ZeroByteMask = 0;
11686 if (!(C & 0x000000ff))
11687 ZeroByteMask |= 0x000000ff;
11688 if (!(C & 0x0000ff00))
11689 ZeroByteMask |= 0x0000ff00;
11690 if (!(C & 0x00ff0000))
11691 ZeroByteMask |= 0x00ff0000;
11692 if (!(C & 0xff000000))
11693 ZeroByteMask |= 0xff000000;
11694 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11695 if ((NonZeroByteMask & C) != NonZeroByteMask)
11696 return 0; // Partial bytes selected.
11697 return C;
11698}
11699
11700// Check if a node selects whole bytes from its operand 0 starting at a byte
11701// boundary while masking the rest. Returns the select mask as used by
11702// v_perm_b32, or ~0 if no such mask can be formed.
11703// Note byte select encoding:
11704// value 0-3 selects corresponding source byte;
11705// value 0xc selects zero;
11706// value 0xff selects 0xff.
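// For example, (and x, 0x00ff0000) yields the mask 0x0c020c0c, and
// (srl x, 16) yields 0x0c0c0302.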
11707static uint32_t getPermuteMask(SDValue V) {
11708 assert(V.getValueSizeInBits() == 32);
11709
11710 if (V.getNumOperands() != 2)
11711 return ~0;
11712
11713 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11714 if (!N1)
11715 return ~0;
11716
11717 uint32_t C = N1->getZExtValue();
11718
11719 switch (V.getOpcode()) {
11720 default:
11721 break;
11722 case ISD::AND:
11723 if (uint32_t ConstMask = getConstantPermuteMask(C))
11724 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11725 break;
11726
11727 case ISD::OR:
11728 if (uint32_t ConstMask = getConstantPermuteMask(C))
11729 return (0x03020100 & ~ConstMask) | ConstMask;
11730 break;
11731
11732 case ISD::SHL:
11733 if (C % 8)
11734 return ~0;
11735
11736 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11737
11738 case ISD::SRL:
11739 if (C % 8)
11740 return ~0;
11741
11742 return uint32_t(0x0c0c0c0c03020100ull >> C);
11743 }
11744
11745 return ~0;
11746}
11747
11748SDValue SITargetLowering::performAndCombine(SDNode *N,
11749 DAGCombinerInfo &DCI) const {
11750 if (DCI.isBeforeLegalize())
11751 return SDValue();
11752
11753 SelectionDAG &DAG = DCI.DAG;
11754 EVT VT = N->getValueType(0);
11755 SDValue LHS = N->getOperand(0);
11756 SDValue RHS = N->getOperand(1);
11757
11758 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11759 if (VT == MVT::i64 && CRHS) {
11760 if (SDValue Split =
11761 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11762 return Split;
11763 }
11764
11765 if (CRHS && VT == MVT::i32) {
11766 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11767 // nb = number of trailing zeroes in mask
11768 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11769 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
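// For example, (and (srl x, 4), 0xff0) becomes (shl (bfe x, 8, 8), 4).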
11770 uint64_t Mask = CRHS->getZExtValue();
11771 unsigned Bits = llvm::popcount(Mask);
11772 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11773 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11774 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11775 unsigned Shift = CShift->getZExtValue();
11776 unsigned NB = CRHS->getAPIntValue().countr_zero();
11777 unsigned Offset = NB + Shift;
11778 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11779 SDLoc SL(N);
11780 SDValue BFE =
11781 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11782 DAG.getConstant(Offset, SL, MVT::i32),
11783 DAG.getConstant(Bits, SL, MVT::i32));
11784 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11785 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11786 DAG.getValueType(NarrowVT));
11787 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11788 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11789 return Shl;
11790 }
11791 }
11792 }
11793
11794 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11795 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11796 isa<ConstantSDNode>(LHS.getOperand(2))) {
11797 uint32_t Sel = getConstantPermuteMask(Mask);
11798 if (!Sel)
11799 return SDValue();
11800
11801 // Select 0xc for all zero bytes
11802 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11803 SDLoc DL(N);
11804 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11805 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11806 }
11807 }
11808
11809 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11810 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11811 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11812 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11813 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11814
11815 SDValue X = LHS.getOperand(0);
11816 SDValue Y = RHS.getOperand(0);
11817 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11818 !isTypeLegal(X.getValueType()))
11819 return SDValue();
11820
11821 if (LCC == ISD::SETO) {
11822 if (X != LHS.getOperand(1))
11823 return SDValue();
11824
11825 if (RCC == ISD::SETUNE) {
11826 const ConstantFPSDNode *C1 =
11827 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11828 if (!C1 || !C1->isInfinity() || C1->isNegative())
11829 return SDValue();
11830
11831 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11832 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11833 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11834 SIInstrFlags::P_NORMAL;
11835
11836 static_assert(
11837 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11838 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11839 0x3ff) == Mask,
11840 "mask not equal");
11841
11842 SDLoc DL(N);
11843 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11844 DAG.getConstant(Mask, DL, MVT::i32));
11845 }
11846 }
11847 }
11848
11849 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11850 std::swap(LHS, RHS);
11851
11852 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11853 RHS.hasOneUse()) {
11854 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11855 // and (fcmp seto), (fp_class x, mask)
11856 //   -> fp_class x, mask & ~(p_nan | n_nan)
11857 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11858 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11859 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11860 (RHS.getOperand(0) == LHS.getOperand(0) &&
11861 LHS.getOperand(0) == LHS.getOperand(1))) {
11862 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11863 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11864 : Mask->getZExtValue() & OrdMask;
11865
11866 SDLoc DL(N);
11867 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11868 DAG.getConstant(NewMask, DL, MVT::i32));
11869 }
11870 }
11871
11872 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11873 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11874 // and x, (sext cc from i1) => select cc, x, 0
11875 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11876 std::swap(LHS, RHS);
11877 if (isBoolSGPR(RHS.getOperand(0)))
11878 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11879 DAG.getConstant(0, SDLoc(N), MVT::i32));
11880 }
11881
11882 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11883 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11884 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11885 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11886 uint32_t LHSMask = getPermuteMask(LHS);
11887 uint32_t RHSMask = getPermuteMask(RHS);
11888 if (LHSMask != ~0u && RHSMask != ~0u) {
11889 // Canonicalize the expression in an attempt to have fewer unique masks
11890 // and therefore fewer registers used to hold the masks.
11891 if (LHSMask > RHSMask) {
11892 std::swap(LHSMask, RHSMask);
11893 std::swap(LHS, RHS);
11894 }
11895
11896 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11897 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11898 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11899 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11900
11901 // Check if we need to combine values from two sources within a byte.
11902 if (!(LHSUsedLanes & RHSUsedLanes) &&
11903 // If we select high and lower word keep it for SDWA.
11904 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11905 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11906 // Each byte in each mask is either a selector in the 0-3 range, or has
11907 // higher bits set: 0xff selects the constant 0xff and 0x0c selects zero.
11908 // If either mask has 0x0c in a byte, the result byte must be 0x0c.
11909 // Otherwise the mask byte that is not 0xff wins. ANDing both masks gives
11910 // the correct result, except that a 0x0c byte must be corrected to 0x0c.
11911 uint32_t Mask = LHSMask & RHSMask;
11912 for (unsigned I = 0; I < 32; I += 8) {
11913 uint32_t ByteSel = 0xff << I;
11914 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11915 Mask &= (0x0c << I) & 0xffffffff;
11916 }
11917
11918 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11919 // or 0x0c.
11920 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11921 SDLoc DL(N);
11922
11923 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11924 RHS.getOperand(0),
11925 DAG.getConstant(Sel, DL, MVT::i32));
11926 }
11927 }
11928 }
11929
11930 return SDValue();
11931}
11932
11933// A key component of v_perm is a mapping between byte position of the src
11934// operands, and the byte position of the dest. To provide such, we need: 1. the
11935// node that provides byte x of the dest of the OR, and 2. the byte of the node
11936// used to provide that byte. calculateByteProvider finds which node provides
11937// a certain byte of the dest of the OR, and calculateSrcByte takes that node
11938// and finds the ultimate src and byte position. For example, the supported
11939// LoadCombine pattern for vector loads is as follows:
11940// t1
11941// or
11942// / \
11943// t2 t3
11944// zext shl
11945// | | \
11946// t4 t5 16
11947// or anyext
11948// / \ |
11949// t6 t7 t8
11950// srl shl or
11951// / | / \ / \
11952// t9 t10 t11 t12 t13 t14
11953// trunc* 8 trunc* 8 and and
11954// | | / | | \
11955// t15 t16 t17 t18 t19 t20
11956// trunc* 255 srl -256
11957// | / \
11958// t15 t15 16
11959//
11960// *In this example, the truncs are from i32->i16
11961//
11962// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11963// respectively. calculateSrcByte would find (given node) -> ultimate src &
11964// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11965// After finding the mapping, we can combine the tree into vperm t15, t16,
11966// 0x05000407
11967
11968// Find the source and byte position from a node.
11969// \p DestByte is the byte position of the dest of the or that the src
11970// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11971// byte of the dest of the or. \p Depth tracks how many recursive iterations we have
11972// performed.
11973static const std::optional<ByteProvider<SDValue>>
11974calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11975 unsigned Depth = 0) {
11976 // We may need to recursively traverse a series of SRLs
11977 if (Depth >= 6)
11978 return std::nullopt;
11979
11980 if (Op.getValueSizeInBits() < 8)
11981 return std::nullopt;
11982
11983 if (Op.getValueType().isVector())
11984 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11985
11986 switch (Op->getOpcode()) {
11987 case ISD::TRUNCATE: {
11988 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11989 }
11990
11991 case ISD::SIGN_EXTEND:
11992 case ISD::ZERO_EXTEND:
11993 case ISD::SIGN_EXTEND_INREG: {
11994 SDValue NarrowOp = Op->getOperand(0);
11995 auto NarrowVT = NarrowOp.getValueType();
11996 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11997 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11998 NarrowVT = VTSign->getVT();
11999 }
12000 if (!NarrowVT.isByteSized())
12001 return std::nullopt;
12002 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12003
12004 if (SrcIndex >= NarrowByteWidth)
12005 return std::nullopt;
12006 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12007 }
12008
12009 case ISD::SRA:
12010 case ISD::SRL: {
12011 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12012 if (!ShiftOp)
12013 return std::nullopt;
12014
12015 uint64_t BitShift = ShiftOp->getZExtValue();
12016
12017 if (BitShift % 8 != 0)
12018 return std::nullopt;
12019
12020 SrcIndex += BitShift / 8;
12021
12022 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12023 }
12024
12025 default: {
12026 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12027 }
12028 }
12029 llvm_unreachable("fully handled switch");
12030}
12031
12032// For a byte position in the result of an Or, traverse the tree and find the
12033// node (and the byte of the node) which ultimately provides this {Or,
12034// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12035// the byte position of the Op that corresponds with the originally requested
12036// byte of the Or. \p Depth tracks how many recursive iterations we have
12037// performed. \p StartingIndex is the originally requested byte of the Or.
12038static const std::optional<ByteProvider<SDValue>>
12039calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12040 unsigned StartingIndex = 0) {
12041 // Finding Src tree of RHS of or typically requires at least 1 additional
12042 // depth
12043 if (Depth > 6)
12044 return std::nullopt;
12045
12046 unsigned BitWidth = Op.getScalarValueSizeInBits();
12047 if (BitWidth % 8 != 0)
12048 return std::nullopt;
12049 if (Index > BitWidth / 8 - 1)
12050 return std::nullopt;
12051
12052 bool IsVec = Op.getValueType().isVector();
12053 switch (Op.getOpcode()) {
12054 case ISD::OR: {
12055 if (IsVec)
12056 return std::nullopt;
12057
12058 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12059 StartingIndex);
12060 if (!RHS)
12061 return std::nullopt;
12062 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12063 StartingIndex);
12064 if (!LHS)
12065 return std::nullopt;
12066 // A well formed Or will have two ByteProviders for each byte, one of which
12067 // is constant zero
12068 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12069 return std::nullopt;
12070 if (!LHS || LHS->isConstantZero())
12071 return RHS;
12072 if (!RHS || RHS->isConstantZero())
12073 return LHS;
12074 return std::nullopt;
12075 }
12076
12077 case ISD::AND: {
12078 if (IsVec)
12079 return std::nullopt;
12080
12081 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12082 if (!BitMaskOp)
12083 return std::nullopt;
12084
12085 uint32_t BitMask = BitMaskOp->getZExtValue();
12086 // Bits we expect for our StartingIndex
12087 uint32_t IndexMask = 0xFF << (Index * 8);
12088
12089 if ((IndexMask & BitMask) != IndexMask) {
12090 // If the result of the and partially provides the byte, then it
12091 // is not well formatted
12092 if (IndexMask & BitMask)
12093 return std::nullopt;
12094 return ByteProvider<SDValue>::getConstantZero();
12095 }
12096
12097 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12098 }
12099
12100 case ISD::FSHR: {
12101 if (IsVec)
12102 return std::nullopt;
12103
12104 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12105 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12106 if (!ShiftOp || Op.getValueType().isVector())
12107 return std::nullopt;
12108
12109 uint64_t BitsProvided = Op.getValueSizeInBits();
12110 if (BitsProvided % 8 != 0)
12111 return std::nullopt;
12112
12113 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12114 if (BitShift % 8)
12115 return std::nullopt;
12116
12117 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12118 uint64_t ByteShift = BitShift / 8;
12119
12120 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12121 uint64_t BytesProvided = BitsProvided / 8;
12122 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12123 NewIndex %= BytesProvided;
12124 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12125 }
12126
12127 case ISD::SRA:
12128 case ISD::SRL: {
12129 if (IsVec)
12130 return std::nullopt;
12131
12132 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12133 if (!ShiftOp)
12134 return std::nullopt;
12135
12136 uint64_t BitShift = ShiftOp->getZExtValue();
12137 if (BitShift % 8)
12138 return std::nullopt;
12139
12140 auto BitsProvided = Op.getScalarValueSizeInBits();
12141 if (BitsProvided % 8 != 0)
12142 return std::nullopt;
12143
12144 uint64_t BytesProvided = BitsProvided / 8;
12145 uint64_t ByteShift = BitShift / 8;
12146 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12147 // If the byte we are trying to provide (as tracked by index) falls in this
12148 // range, then the SRL provides the byte. The byte of interest of the src of
12149 // the SRL is Index + ByteShift
12150 return BytesProvided - ByteShift > Index
12151 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12152 Index + ByteShift)
12153 : ByteProvider<SDValue>::getConstantZero();
12154 }
12155
12156 case ISD::SHL: {
12157 if (IsVec)
12158 return std::nullopt;
12159
12160 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12161 if (!ShiftOp)
12162 return std::nullopt;
12163
12164 uint64_t BitShift = ShiftOp->getZExtValue();
12165 if (BitShift % 8 != 0)
12166 return std::nullopt;
12167 uint64_t ByteShift = BitShift / 8;
12168
12169 // If we are shifting by an amount greater than (or equal to)
12170 // the index we are trying to provide, then it provides 0s. If not,
12171 // then these bytes are not definitively 0s, and the corresponding byte
12172 // of interest is Index - ByteShift of the src
12173 return Index < ByteShift
12174 ? ByteProvider<SDValue>::getConstantZero()
12175 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12176 Depth + 1, StartingIndex);
12177 }
12178 case ISD::ANY_EXTEND:
12179 case ISD::SIGN_EXTEND:
12180 case ISD::ZERO_EXTEND:
12181 case ISD::SIGN_EXTEND_INREG:
12182 case ISD::AssertZext:
12183 case ISD::AssertSext: {
12184 if (IsVec)
12185 return std::nullopt;
12186
12187 SDValue NarrowOp = Op->getOperand(0);
12188 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12189 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12190 Op->getOpcode() == ISD::AssertZext ||
12191 Op->getOpcode() == ISD::AssertSext) {
12192 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12193 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12194 }
12195 if (NarrowBitWidth % 8 != 0)
12196 return std::nullopt;
12197 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12198
12199 if (Index >= NarrowByteWidth)
12200 return Op.getOpcode() == ISD::ZERO_EXTEND
12201 ? std::optional<ByteProvider<SDValue>>(
12202 ByteProvider<SDValue>::getConstantZero())
12203 : std::nullopt;
12204 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12205 }
12206
12207 case ISD::TRUNCATE: {
12208 if (IsVec)
12209 return std::nullopt;
12210
12211 uint64_t NarrowByteWidth = BitWidth / 8;
12212
12213 if (NarrowByteWidth >= Index) {
12214 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12215 StartingIndex);
12216 }
12217
12218 return std::nullopt;
12219 }
12220
12221 case ISD::CopyFromReg: {
12222 if (BitWidth / 8 > Index)
12223 return calculateSrcByte(Op, StartingIndex, Index);
12224
12225 return std::nullopt;
12226 }
12227
12228 case ISD::LOAD: {
12229 auto *L = cast<LoadSDNode>(Op.getNode());
12230
12231 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12232 if (NarrowBitWidth % 8 != 0)
12233 return std::nullopt;
12234 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12235
12236 // If the width of the load does not reach the byte we are trying to provide
12237 // and it is not a ZEXTLOAD, then the load does not provide the byte in
12238 // question
12239 if (Index >= NarrowByteWidth) {
12240 return L->getExtensionType() == ISD::ZEXTLOAD
12241 ? std::optional<ByteProvider<SDValue>>(
12242 ByteProvider<SDValue>::getConstantZero())
12243 : std::nullopt;
12244 }
12245
12246 if (NarrowByteWidth > Index) {
12247 return calculateSrcByte(Op, StartingIndex, Index);
12248 }
12249
12250 return std::nullopt;
12251 }
12252
12253 case ISD::BSWAP: {
12254 if (IsVec)
12255 return std::nullopt;
12256
12257 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12258 Depth + 1, StartingIndex);
12259 }
12260
12261 case ISD::EXTRACT_VECTOR_ELT: {
12262 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12263 if (!IdxOp)
12264 return std::nullopt;
12265 auto VecIdx = IdxOp->getZExtValue();
12266 auto ScalarSize = Op.getScalarValueSizeInBits();
12267 if (ScalarSize < 32)
12268 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12269 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12270 StartingIndex, Index);
12271 }
12272
12273 case AMDGPUISD::PERM: {
12274 if (IsVec)
12275 return std::nullopt;
12276
12277 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12278 if (!PermMask)
12279 return std::nullopt;
12280
12281 auto IdxMask =
12282 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12283 if (IdxMask > 0x07 && IdxMask != 0x0c)
12284 return std::nullopt;
12285
12286 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12287 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12288
12289 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12290 : ByteProvider<SDValue>(
12291 ByteProvider<SDValue>::getConstantZero());
12292 }
12293
12294 default: {
12295 return std::nullopt;
12296 }
12297 }
12298
12299 llvm_unreachable("fully handled switch");
12300}
12301
12302// Returns true if the Operand is a scalar that was extended from a 16-bit value
12303static bool isExtendedFrom16Bits(SDValue &Operand) {
12304
12305 switch (Operand.getOpcode()) {
12306 case ISD::ANY_EXTEND:
12307 case ISD::SIGN_EXTEND:
12308 case ISD::ZERO_EXTEND: {
12309 auto OpVT = Operand.getOperand(0).getValueType();
12310 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12311 }
12312 case ISD::LOAD: {
12313 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12314 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12315 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12316 ExtType == ISD::EXTLOAD) {
12317 auto MemVT = L->getMemoryVT();
12318 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12319 }
12320 return L->getMemoryVT().getSizeInBits() == 16;
12321 }
12322 default:
12323 return false;
12324 }
12325}
12326
12327// Returns true if the mask matches consecutive bytes, and the first byte
12328// begins at a power of 2 byte offset from 0th byte
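// For example, addresses16Bits(0x0504) is true, while addresses16Bits(0x0605)
// is false because the pair starts at an odd byte offset.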
12329static bool addresses16Bits(int Mask) {
12330 int Low8 = Mask & 0xff;
12331 int Hi8 = (Mask & 0xff00) >> 8;
12332
12333 assert(Low8 < 8 && Hi8 < 8);
12334 // Are the bytes contiguous in the order of increasing addresses.
12335 bool IsConsecutive = (Hi8 - Low8 == 1);
12336 // Is the first byte at location that is aligned for 16 bit instructions.
12337 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12338 // In this case, we still need code to extract the 16 bit operand, so it
12339 // is better to use i8 v_perm
12340 bool Is16Aligned = !(Low8 % 2);
12341
12342 return IsConsecutive && Is16Aligned;
12343}
12344
12345// Do not lower into v_perm if the operands are actually 16 bit
12346// and the selected bits (based on PermMask) correspond with two
12347// easily addressable 16 bit operands.
12348static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12349 SDValue &OtherOp) {
12350 int Low16 = PermMask & 0xffff;
12351 int Hi16 = (PermMask & 0xffff0000) >> 16;
12352
12353 auto TempOp = peekThroughBitcasts(Op);
12354 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12355
12356 auto OpIs16Bit =
12357 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12358 if (!OpIs16Bit)
12359 return true;
12360
12361 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12362 isExtendedFrom16Bits(TempOtherOp);
12363 if (!OtherOpIs16Bit)
12364 return true;
12365
12366 // Do we cleanly address both
12367 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12368}
12369
12370static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12371 unsigned DWordOffset) {
12372 SDValue Ret;
12373
12374 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12375 // ByteProvider must be at least 8 bits
12376 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12377
12378 if (TypeSize <= 32)
12379 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12380
12381 if (Src.getValueType().isVector()) {
12382 auto ScalarTySize = Src.getScalarValueSizeInBits();
12383 auto ScalarTy = Src.getValueType().getScalarType();
12384 if (ScalarTySize == 32) {
12385 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12386 DAG.getConstant(DWordOffset, SL, MVT::i32));
12387 }
12388 if (ScalarTySize > 32) {
12389 Ret = DAG.getNode(
12390 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12391 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12392 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12393 if (ShiftVal)
12394 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12395 DAG.getConstant(ShiftVal, SL, MVT::i32));
12396 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12397 }
12398
12399 assert(ScalarTySize < 32);
12400 auto NumElements = TypeSize / ScalarTySize;
12401 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12402 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12403 auto NumElementsIn32 = 32 / ScalarTySize;
12404 auto NumAvailElements = DWordOffset < Trunc32Elements
12405 ? NumElementsIn32
12406 : NumElements - NormalizedTrunc;
12407
12408 SmallVector<SDValue, 4> VecSrcs;
12409 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12410 NumAvailElements);
12411
12412 Ret = DAG.getBuildVector(
12413 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12414 VecSrcs);
12415 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12416 }
12417
12418 /// Scalar Type
12419 auto ShiftVal = 32 * DWordOffset;
12420 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12421 DAG.getConstant(ShiftVal, SL, MVT::i32));
12422 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12423}
12424
12425static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12426 SelectionDAG &DAG = DCI.DAG;
12427 [[maybe_unused]] EVT VT = N->getValueType(0);
12428 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12429
12430 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12431 assert(VT == MVT::i32);
12432 for (int i = 0; i < 4; i++) {
12433 // Find the ByteProvider that provides the ith byte of the result of OR
12434 std::optional<ByteProvider<SDValue>> P =
12435 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12436 // TODO support constantZero
12437 if (!P || P->isConstantZero())
12438 return SDValue();
12439
12440 PermNodes.push_back(*P);
12441 }
12442 if (PermNodes.size() != 4)
12443 return SDValue();
12444
12445 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12446 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12447 uint64_t PermMask = 0x00000000;
12448 for (size_t i = 0; i < PermNodes.size(); i++) {
12449 auto PermOp = PermNodes[i];
12450 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12451 // by sizeof(Src2) = 4
12452 int SrcByteAdjust = 4;
12453
12454 // If the Src uses a byte from a different DWORD, then it corresponds
12455 // with a different source.
12456 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12457 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12458 if (SecondSrc)
12459 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12460 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12461 return SDValue();
12462
12463 // Set the index of the second distinct Src node
12464 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12465 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12466 SrcByteAdjust = 0;
12467 }
12468 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12470 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12471 }
12472 SDLoc DL(N);
12473 SDValue Op = *PermNodes[FirstSrc.first].Src;
12474 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12475 assert(Op.getValueSizeInBits() == 32);
12476
12477 // Check that we are not just extracting the bytes in order from an op
12478 if (!SecondSrc) {
12479 int Low16 = PermMask & 0xffff;
12480 int Hi16 = (PermMask & 0xffff0000) >> 16;
12481
12482 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12483 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12484
12485 // The perm op would really just produce Op. So combine into Op
12486 if (WellFormedLow && WellFormedHi)
12487 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12488 }
12489
12490 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12491
12492 if (SecondSrc) {
12493 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12494 assert(OtherOp.getValueSizeInBits() == 32);
12495 }
12496
12497 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12498
12499 assert(Op.getValueType().isByteSized() &&
12500 OtherOp.getValueType().isByteSized());
12501
12502 // If the ultimate src is less than 32 bits, then we will only be
12503 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12504 // CalculateByteProvider would not have returned Op as source if we
12505 // used a byte that is outside its ValueType. Thus, we are free to
12506 // ANY_EXTEND as the extended bits are dont-cares.
12507 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12508 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12509
12510 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12511 DAG.getConstant(PermMask, DL, MVT::i32));
12512 }
12513 return SDValue();
12514}
12515
12516SDValue SITargetLowering::performOrCombine(SDNode *N,
12517 DAGCombinerInfo &DCI) const {
12518 SelectionDAG &DAG = DCI.DAG;
12519 SDValue LHS = N->getOperand(0);
12520 SDValue RHS = N->getOperand(1);
12521
12522 EVT VT = N->getValueType(0);
12523 if (VT == MVT::i1) {
12524 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12525 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12526 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12527 SDValue Src = LHS.getOperand(0);
12528 if (Src != RHS.getOperand(0))
12529 return SDValue();
12530
12531 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12532 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12533 if (!CLHS || !CRHS)
12534 return SDValue();
12535
12536 // Only 10 bits are used.
12537 static const uint32_t MaxMask = 0x3ff;
12538
12539 uint32_t NewMask =
12540 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12541 SDLoc DL(N);
12542 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12543 DAG.getConstant(NewMask, DL, MVT::i32));
12544 }
12545
12546 return SDValue();
12547 }
12548
12549 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12550 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12551 LHS.getOpcode() == AMDGPUISD::PERM &&
12552 isa<ConstantSDNode>(LHS.getOperand(2))) {
12553 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12554 if (!Sel)
12555 return SDValue();
12556
12557 Sel |= LHS.getConstantOperandVal(2);
12558 SDLoc DL(N);
12559 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12560 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12561 }
12562
12563 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12564 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12565 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12566 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12567
12568 // If all the uses of an or need to extract the individual elements, do not
12569 // attempt to lower into v_perm
12570 auto usesCombinedOperand = [](SDNode *OrUse) {
12571 // If we have any non-vectorized use, then it is a candidate for v_perm
12572 if (OrUse->getOpcode() != ISD::BITCAST ||
12573 !OrUse->getValueType(0).isVector())
12574 return true;
12575
12576 // If we have any non-vectorized use, then it is a candidate for v_perm
12577 for (auto *VUser : OrUse->users()) {
12578 if (!VUser->getValueType(0).isVector())
12579 return true;
12580
12581 // If the use of a vector is a store, then combining via a v_perm
12582 // is beneficial.
12583 // TODO -- whitelist more uses
12584 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12585 if (VUser->getOpcode() == VectorwiseOp)
12586 return true;
12587 }
12588 return false;
12589 };
12590
12591 if (!any_of(N->users(), usesCombinedOperand))
12592 return SDValue();
12593
12594 uint32_t LHSMask = getPermuteMask(LHS);
12595 uint32_t RHSMask = getPermuteMask(RHS);
12596
12597 if (LHSMask != ~0u && RHSMask != ~0u) {
12598 // Canonicalize the expression in an attempt to have fewer unique masks
12599 // and therefore fewer registers used to hold the masks.
12600 if (LHSMask > RHSMask) {
12601 std::swap(LHSMask, RHSMask);
12602 std::swap(LHS, RHS);
12603 }
12604
12605 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12606 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
12607 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12608 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12609
12610 // Check if we need to combine values from two sources within a byte.
12611 if (!(LHSUsedLanes & RHSUsedLanes) &&
12612 // If we select high and lower word keep it for SDWA.
12613 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12614 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12615 // Kill zero bytes selected by other mask. Zero value is 0xc.
12616 LHSMask &= ~RHSUsedLanes;
12617 RHSMask &= ~LHSUsedLanes;
12618 // Add 4 to each active LHS lane
12619 LHSMask |= LHSUsedLanes & 0x04040404;
12620 // Combine masks
12621 uint32_t Sel = LHSMask | RHSMask;
12622 SDLoc DL(N);
12623
12624 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12625 RHS.getOperand(0),
12626 DAG.getConstant(Sel, DL, MVT::i32));
12627 }
12628 }
12629 if (LHSMask == ~0u || RHSMask == ~0u) {
12630 if (SDValue Perm = matchPERM(N, DCI))
12631 return Perm;
12632 }
12633 }
12634
12635 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12636 return SDValue();
12637
12638 // TODO: This could be a generic combine with a predicate for extracting the
12639 // high half of an integer being free.
12640
12641 // (or i64:x, (zero_extend i32:y)) ->
12642 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12643 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12644 RHS.getOpcode() != ISD::ZERO_EXTEND)
12645 std::swap(LHS, RHS);
12646
12647 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12648 SDValue ExtSrc = RHS.getOperand(0);
12649 EVT SrcVT = ExtSrc.getValueType();
12650 if (SrcVT == MVT::i32) {
12651 SDLoc SL(N);
12652 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12653 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12654
12655 DCI.AddToWorklist(LowOr.getNode());
12656 DCI.AddToWorklist(HiBits.getNode());
12657
12658 SDValue Vec =
12659 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12660 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12661 }
12662 }
12663
12664 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12665 if (CRHS) {
12666 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12667 N->getOperand(0), CRHS))
12668 return Split;
12669 }
12670
12671 return SDValue();
12672}
12673
12674SDValue SITargetLowering::performXorCombine(SDNode *N,
12675 DAGCombinerInfo &DCI) const {
12676 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12677 return RV;
12678
12679 SDValue LHS = N->getOperand(0);
12680 SDValue RHS = N->getOperand(1);
12681
12682 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12683 SelectionDAG &DAG = DCI.DAG;
12684
12685 EVT VT = N->getValueType(0);
12686 if (CRHS && VT == MVT::i64) {
12687 if (SDValue Split =
12688 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12689 return Split;
12690 }
12691
12692 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12693 // fneg-like xors into 64-bit select.
12694 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12695 // This looks like an fneg, try to fold as a source modifier.
12696 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12697 shouldFoldFNegIntoSrc(N, LHS)) {
12698 // xor (select c, a, b), 0x80000000 ->
12699 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12700 SDLoc DL(N);
12701 SDValue CastLHS =
12702 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12703 SDValue CastRHS =
12704 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12705 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12706 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12707 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12708 LHS->getOperand(0), FNegLHS, FNegRHS);
12709 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12710 }
12711 }
12712
12713 return SDValue();
12714}
12715
12716SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12717 DAGCombinerInfo &DCI) const {
12718 if (!Subtarget->has16BitInsts() ||
12719 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12720 return SDValue();
12721
12722 EVT VT = N->getValueType(0);
12723 if (VT != MVT::i32)
12724 return SDValue();
12725
12726 SDValue Src = N->getOperand(0);
12727 if (Src.getValueType() != MVT::i16)
12728 return SDValue();
12729
12730 return SDValue();
12731}
12732
12733SDValue
12734SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12735 DAGCombinerInfo &DCI) const {
12736 SDValue Src = N->getOperand(0);
12737 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12738
12739 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12740 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12741 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12742 VTSign->getVT() == MVT::i8) ||
12743 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12744 VTSign->getVT() == MVT::i16))) {
12745 assert(Subtarget->hasScalarSubwordLoads() &&
12746 "s_buffer_load_{u8, i8} are supported "
12747 "in GFX12 (or newer) architectures.");
12748 EVT VT = Src.getValueType();
12749 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12750 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12751 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12752 SDLoc DL(N);
12753 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12754 SDValue Ops[] = {
12755 Src.getOperand(0), // source register
12756 Src.getOperand(1), // offset
12757 Src.getOperand(2) // cachePolicy
12758 };
12759 auto *M = cast<MemSDNode>(Src);
12760 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12761 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12762 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12763 return LoadVal;
12764 }
12765 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12766 VTSign->getVT() == MVT::i8) ||
12767 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12768 VTSign->getVT() == MVT::i16)) &&
12769 Src.hasOneUse()) {
12770 auto *M = cast<MemSDNode>(Src);
12771 SDValue Ops[] = {Src.getOperand(0), // Chain
12772 Src.getOperand(1), // rsrc
12773 Src.getOperand(2), // vindex
12774 Src.getOperand(3), // voffset
12775 Src.getOperand(4), // soffset
12776 Src.getOperand(5), // offset
12777 Src.getOperand(6), Src.getOperand(7)};
12778 // replace with BUFFER_LOAD_BYTE/SHORT
12779 SDVTList ResList =
12780 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12781 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12782 ? AMDGPUISD::BUFFER_LOAD_BYTE
12783 : AMDGPUISD::BUFFER_LOAD_SHORT;
12784 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12785 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12786 return DCI.DAG.getMergeValues(
12787 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12788 }
12789 return SDValue();
12790}
12791
12792SDValue SITargetLowering::performClassCombine(SDNode *N,
12793 DAGCombinerInfo &DCI) const {
12794 SelectionDAG &DAG = DCI.DAG;
12795 SDValue Mask = N->getOperand(1);
12796
12797 // fp_class x, 0 -> false
12798 if (isNullConstant(Mask))
12799 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12800
12801 if (N->getOperand(0).isUndef())
12802 return DAG.getUNDEF(MVT::i1);
12803
12804 return SDValue();
12805}
12806
12807SDValue SITargetLowering::performRcpCombine(SDNode *N,
12808 DAGCombinerInfo &DCI) const {
12809 EVT VT = N->getValueType(0);
12810 SDValue N0 = N->getOperand(0);
12811
12812 if (N0.isUndef()) {
12813 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12814 SDLoc(N), VT);
12815 }
12816
12817 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12818 N0.getOpcode() == ISD::SINT_TO_FP)) {
12819 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12820 N->getFlags());
12821 }
12822
12823 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12824 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12825 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12826 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12827 N->getFlags());
12828 }
12829
12830 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12831}
12832
12833bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12834 unsigned MaxDepth) const {
12835 unsigned Opcode = Op.getOpcode();
12836 if (Opcode == ISD::FCANONICALIZE)
12837 return true;
12838
12839 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12840 const auto &F = CFP->getValueAPF();
12841 if (F.isNaN() && F.isSignaling())
12842 return false;
12843 if (!F.isDenormal())
12844 return true;
12845
12846 DenormalMode Mode =
12847 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12848 return Mode == DenormalMode::getIEEE();
12849 }
12850
12851 // If source is a result of another standard FP operation it is already in
12852 // canonical form.
12853 if (MaxDepth == 0)
12854 return false;
12855
12856 switch (Opcode) {
12857 // These will flush denorms if required.
12858 case ISD::FADD:
12859 case ISD::FSUB:
12860 case ISD::FMUL:
12861 case ISD::FCEIL:
12862 case ISD::FFLOOR:
12863 case ISD::FMA:
12864 case ISD::FMAD:
12865 case ISD::FSQRT:
12866 case ISD::FDIV:
12867 case ISD::FREM:
12868 case ISD::FP_ROUND:
12869 case ISD::FP_EXTEND:
12870 case ISD::FP16_TO_FP:
12871 case ISD::FP_TO_FP16:
12872 case ISD::BF16_TO_FP:
12873 case ISD::FP_TO_BF16:
12874 case ISD::FLDEXP:
12877 case AMDGPUISD::RCP:
12878 case AMDGPUISD::RSQ:
12882 case AMDGPUISD::LOG:
12883 case AMDGPUISD::EXP:
12887 case AMDGPUISD::FRACT:
12894 case AMDGPUISD::SIN_HW:
12895 case AMDGPUISD::COS_HW:
12896 return true;
12897
12898 // It can/will be lowered or combined as a bit operation.
12899 // Need to check their input recursively to handle.
12900 case ISD::FNEG:
12901 case ISD::FABS:
12902 case ISD::FCOPYSIGN:
12903 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12904
12905 case ISD::AND:
12906 if (Op.getValueType() == MVT::i32) {
12907 // Be careful as we only know it is a bitcast floating point type. It
12908 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12909 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12910 // is valid to optimize for all types.
12911 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12912 if (RHS->getZExtValue() == 0xffff0000) {
12913 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12914 }
12915 }
12916 }
12917 break;
12918
12919 case ISD::FSIN:
12920 case ISD::FCOS:
12921 case ISD::FSINCOS:
12922 return Op.getValueType().getScalarType() != MVT::f16;
12923
12924 case ISD::FMINNUM:
12925 case ISD::FMAXNUM:
12926 case ISD::FMINNUM_IEEE:
12927 case ISD::FMAXNUM_IEEE:
12928 case ISD::FMINIMUM:
12929 case ISD::FMAXIMUM:
12930 case AMDGPUISD::CLAMP:
12931 case AMDGPUISD::FMED3:
12932 case AMDGPUISD::FMAX3:
12933 case AMDGPUISD::FMIN3:
12934 case AMDGPUISD::FMAXIMUM3:
12935 case AMDGPUISD::FMINIMUM3: {
12936 // FIXME: Shouldn't treat the generic operations differently based on these.
12937 // However, we aren't really required to flush the result from
12938 // minnum/maxnum..
12939
12940 // snans will be quieted, so we only need to worry about denormals.
12941 if (Subtarget->supportsMinMaxDenormModes() ||
12942 // FIXME: denormalsEnabledForType is broken for dynamic
12943 denormalsEnabledForType(DAG, Op.getValueType()))
12944 return true;
12945
12946 // Flushing may be required.
12947 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12948 // targets need to check their input recursively.
12949
12950 // FIXME: Does this apply with clamp? It's implemented with max.
12951 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12952 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12953 return false;
12954 }
12955
12956 return true;
12957 }
12958 case ISD::SELECT: {
12959 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12960 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12961 }
12962 case ISD::BUILD_VECTOR: {
12963 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12964 SDValue SrcOp = Op.getOperand(i);
12965 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12966 return false;
12967 }
12968
12969 return true;
12970 }
12971 case ISD::EXTRACT_VECTOR_ELT:
12972 case ISD::EXTRACT_SUBVECTOR: {
12973 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12974 }
12975 case ISD::INSERT_VECTOR_ELT: {
12976 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12977 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12978 }
12979 case ISD::UNDEF:
12980 // Could be anything.
12981 return false;
12982
12983 case ISD::BITCAST:
12984 // TODO: This is incorrect as it loses track of the operand's type. We may
12985 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12986 // same bits that are canonicalized in one type need not be in the other.
12987 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12988 case ISD::TRUNCATE: {
12989 // Hack round the mess we make when legalizing extract_vector_elt
12990 if (Op.getValueType() == MVT::i16) {
12991 SDValue TruncSrc = Op.getOperand(0);
12992 if (TruncSrc.getValueType() == MVT::i32 &&
12993 TruncSrc.getOpcode() == ISD::BITCAST &&
12994 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12995 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12996 }
12997 }
12998 return false;
12999 }
13000 case ISD::INTRINSIC_WO_CHAIN: {
13001 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13002 // TODO: Handle more intrinsics
13003 switch (IntrinsicID) {
13004 case Intrinsic::amdgcn_cvt_pkrtz:
13005 case Intrinsic::amdgcn_cubeid:
13006 case Intrinsic::amdgcn_frexp_mant:
13007 case Intrinsic::amdgcn_fdot2:
13008 case Intrinsic::amdgcn_rcp:
13009 case Intrinsic::amdgcn_rsq:
13010 case Intrinsic::amdgcn_rsq_clamp:
13011 case Intrinsic::amdgcn_rcp_legacy:
13012 case Intrinsic::amdgcn_rsq_legacy:
13013 case Intrinsic::amdgcn_trig_preop:
13014 case Intrinsic::amdgcn_log:
13015 case Intrinsic::amdgcn_exp2:
13016 case Intrinsic::amdgcn_sqrt:
13017 return true;
13018 default:
13019 break;
13020 }
13021
13022 break;
13023 }
13024 default:
13025 break;
13026 }
13027
13028 // FIXME: denormalsEnabledForType is broken for dynamic
13029 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13030 DAG.isKnownNeverSNaN(Op);
13031}
13032
13033bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13034 unsigned MaxDepth) const {
13035 const MachineRegisterInfo &MRI = MF.getRegInfo();
13036 MachineInstr *MI = MRI.getVRegDef(Reg);
13037 unsigned Opcode = MI->getOpcode();
13038
13039 if (Opcode == AMDGPU::G_FCANONICALIZE)
13040 return true;
13041
13042 std::optional<FPValueAndVReg> FCR;
13043 // Constant splat (can be padded with undef) or scalar constant.
13044 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13045 if (FCR->Value.isSignaling())
13046 return false;
13047 if (!FCR->Value.isDenormal())
13048 return true;
13049
13050 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13051 return Mode == DenormalMode::getIEEE();
13052 }
13053
13054 if (MaxDepth == 0)
13055 return false;
13056
13057 switch (Opcode) {
13058 case AMDGPU::G_FADD:
13059 case AMDGPU::G_FSUB:
13060 case AMDGPU::G_FMUL:
13061 case AMDGPU::G_FCEIL:
13062 case AMDGPU::G_FFLOOR:
13063 case AMDGPU::G_FRINT:
13064 case AMDGPU::G_FNEARBYINT:
13065 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13066 case AMDGPU::G_INTRINSIC_TRUNC:
13067 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13068 case AMDGPU::G_FMA:
13069 case AMDGPU::G_FMAD:
13070 case AMDGPU::G_FSQRT:
13071 case AMDGPU::G_FDIV:
13072 case AMDGPU::G_FREM:
13073 case AMDGPU::G_FPOW:
13074 case AMDGPU::G_FPEXT:
13075 case AMDGPU::G_FLOG:
13076 case AMDGPU::G_FLOG2:
13077 case AMDGPU::G_FLOG10:
13078 case AMDGPU::G_FPTRUNC:
13079 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13080 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13081 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13082 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13083 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13084 return true;
13085 case AMDGPU::G_FNEG:
13086 case AMDGPU::G_FABS:
13087 case AMDGPU::G_FCOPYSIGN:
13088 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13089 case AMDGPU::G_FMINNUM:
13090 case AMDGPU::G_FMAXNUM:
13091 case AMDGPU::G_FMINNUM_IEEE:
13092 case AMDGPU::G_FMAXNUM_IEEE:
13093 case AMDGPU::G_FMINIMUM:
13094 case AMDGPU::G_FMAXIMUM: {
13095 if (Subtarget->supportsMinMaxDenormModes() ||
13096 // FIXME: denormalsEnabledForType is broken for dynamic
13097 denormalsEnabledForType(MRI.getType(Reg), MF))
13098 return true;
13099
13100 [[fallthrough]];
13101 }
13102 case AMDGPU::G_BUILD_VECTOR:
13103 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13104 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13105 return false;
13106 return true;
13107 case AMDGPU::G_INTRINSIC:
13108 case AMDGPU::G_INTRINSIC_CONVERGENT:
13109 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13110 case Intrinsic::amdgcn_fmul_legacy:
13111 case Intrinsic::amdgcn_fmad_ftz:
13112 case Intrinsic::amdgcn_sqrt:
13113 case Intrinsic::amdgcn_fmed3:
13114 case Intrinsic::amdgcn_sin:
13115 case Intrinsic::amdgcn_cos:
13116 case Intrinsic::amdgcn_log:
13117 case Intrinsic::amdgcn_exp2:
13118 case Intrinsic::amdgcn_log_clamp:
13119 case Intrinsic::amdgcn_rcp:
13120 case Intrinsic::amdgcn_rcp_legacy:
13121 case Intrinsic::amdgcn_rsq:
13122 case Intrinsic::amdgcn_rsq_clamp:
13123 case Intrinsic::amdgcn_rsq_legacy:
13124 case Intrinsic::amdgcn_div_scale:
13125 case Intrinsic::amdgcn_div_fmas:
13126 case Intrinsic::amdgcn_div_fixup:
13127 case Intrinsic::amdgcn_fract:
13128 case Intrinsic::amdgcn_cvt_pkrtz:
13129 case Intrinsic::amdgcn_cubeid:
13130 case Intrinsic::amdgcn_cubema:
13131 case Intrinsic::amdgcn_cubesc:
13132 case Intrinsic::amdgcn_cubetc:
13133 case Intrinsic::amdgcn_frexp_mant:
13134 case Intrinsic::amdgcn_fdot2:
13135 case Intrinsic::amdgcn_trig_preop:
13136 return true;
13137 default:
13138 break;
13139 }
13140
13141 [[fallthrough]];
13142 default:
13143 return false;
13144 }
13145
13146 llvm_unreachable("invalid operation");
13147}
13148
13149// Constant fold canonicalize.
13150SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13151 const SDLoc &SL, EVT VT,
13152 const APFloat &C) const {
13153 // Flush denormals to 0 if not enabled.
13154 if (C.isDenormal()) {
13155 DenormalMode Mode =
13156 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13157 if (Mode == DenormalMode::getPreserveSign()) {
13158 return DAG.getConstantFP(
13159 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13160 }
13161
13162 if (Mode != DenormalMode::getIEEE())
13163 return SDValue();
13164 }
13165
13166 if (C.isNaN()) {
13167 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13168 if (C.isSignaling()) {
13169 // Quiet a signaling NaN.
13170 // FIXME: Is this supposed to preserve payload bits?
13171 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13172 }
13173
13174 // Make sure it is the canonical NaN bitpattern.
13175 //
13176 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13177 // immediate?
13178 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13179 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13180 }
13181
13182 // Already canonical.
13183 return DAG.getConstantFP(C, SL, VT);
13184}
13185
13186static bool vectorEltWillFoldAway(SDValue Op) {
13187 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13188}
13189
13190SDValue
13191SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13192 DAGCombinerInfo &DCI) const {
13193 SelectionDAG &DAG = DCI.DAG;
13194 SDValue N0 = N->getOperand(0);
13195 EVT VT = N->getValueType(0);
13196
13197 // fcanonicalize undef -> qnan
13198 if (N0.isUndef()) {
13199 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13200 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13201 }
13202
13203 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13204 EVT VT = N->getValueType(0);
13205 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13206 }
13207
13208 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13209 // (fcanonicalize k)
13210 //
13211 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13212
13213 // TODO: This could be better with wider vectors that will be split to v2f16,
13214 // and to consider uses since there aren't that many packed operations.
13215 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13216 isTypeLegal(MVT::v2f16)) {
13217 SDLoc SL(N);
13218 SDValue NewElts[2];
13219 SDValue Lo = N0.getOperand(0);
13220 SDValue Hi = N0.getOperand(1);
13221 EVT EltVT = Lo.getValueType();
13222
13223 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
13224 for (unsigned I = 0; I != 2; ++I) {
13225 SDValue Op = N0.getOperand(I);
13226 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13227 NewElts[I] =
13228 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13229 } else if (Op.isUndef()) {
13230 // Handled below based on what the other operand is.
13231 NewElts[I] = Op;
13232 } else {
13233 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13234 }
13235 }
13236
13237 // If one half is undef, and one is constant, prefer a splat vector rather
13238 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13239 // cheaper to use and may be free with a packed operation.
13240 if (NewElts[0].isUndef()) {
13241 if (isa<ConstantFPSDNode>(NewElts[1]))
13242 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13243 ? NewElts[1]
13244 : DAG.getConstantFP(0.0f, SL, EltVT);
13245 }
13246
13247 if (NewElts[1].isUndef()) {
13248 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13249 ? NewElts[0]
13250 : DAG.getConstantFP(0.0f, SL, EltVT);
13251 }
13252
13253 return DAG.getBuildVector(VT, SL, NewElts);
13254 }
13255 }
13256
13257 return SDValue();
13258}
13259
13260static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13261 switch (Opc) {
13262 case ISD::FMAXNUM:
13263 case ISD::FMAXNUM_IEEE:
13264 return AMDGPUISD::FMAX3;
13265 case ISD::FMAXIMUM:
13266 return AMDGPUISD::FMAXIMUM3;
13267 case ISD::SMAX:
13268 return AMDGPUISD::SMAX3;
13269 case ISD::UMAX:
13270 return AMDGPUISD::UMAX3;
13271 case ISD::FMINNUM:
13272 case ISD::FMINNUM_IEEE:
13273 return AMDGPUISD::FMIN3;
13274 case ISD::FMINIMUM:
13275 return AMDGPUISD::FMINIMUM3;
13276 case ISD::SMIN:
13277 return AMDGPUISD::SMIN3;
13278 case ISD::UMIN:
13279 return AMDGPUISD::UMIN3;
13280 default:
13281 llvm_unreachable("Not a min/max opcode");
13282 }
13283}
13284
13285SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13286 const SDLoc &SL, SDValue Src,
13287 SDValue MinVal,
13288 SDValue MaxVal,
13289 bool Signed) const {
13290
13291 // med3 comes from
13292 // min(max(x, K0), K1), K0 < K1
13293 // max(min(x, K0), K1), K1 < K0
13294 //
13295 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13296 // min/max op.
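// For example, smin(smax(x, -3), 7) can become smed3(x, -3, 7).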
13297 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13298 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13299
13300 if (!MinK || !MaxK)
13301 return SDValue();
13302
13303 if (Signed) {
13304 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13305 return SDValue();
13306 } else {
13307 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13308 return SDValue();
13309 }
13310
13311 EVT VT = MinK->getValueType(0);
13312 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13313 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13314 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13315
13316 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13317 // not available, but this is unlikely to be profitable as constants
13318 // will often need to be materialized & extended, especially on
13319 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13320 return SDValue();
13321}
13322
13323static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13324  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13325 return C;
13326
13327 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13328 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13329 return C;
13330 }
13331
13332 return nullptr;
13333}
13334
13335SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13336 const SDLoc &SL, SDValue Op0,
13337 SDValue Op1) const {
13338  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13339  if (!K1)
13340 return SDValue();
13341
13342  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13343  if (!K0)
13344 return SDValue();
13345
13346 // Ordered >= (although NaN inputs should have folded away by now).
13347 if (K0->getValueAPF() > K1->getValueAPF())
13348 return SDValue();
13349
13350 const MachineFunction &MF = DAG.getMachineFunction();
13351  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13352
13353 // TODO: Check IEEE bit enabled?
13354 EVT VT = Op0.getValueType();
13355 if (Info->getMode().DX10Clamp) {
13356 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13357 // hardware fmed3 behavior converting to a min.
13358 // FIXME: Should this be allowing -0.0?
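    // For example: fminnum(fmaxnum(x, 0.0), 1.0) --> clamp(x). With
    // dx10_clamp enabled both forms map a NaN input to 0.0, so the fold is
    // safe even for (quiet) NaN inputs.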
13359 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13360 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13361 }
13362
13363 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13364 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13365 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13366 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13367 // then give the other result, which is different from med3 with a NaN
13368 // input.
13369 SDValue Var = Op0.getOperand(0);
13370 if (!DAG.isKnownNeverSNaN(Var))
13371 return SDValue();
13372
13373    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13374
13375 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13376 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13377 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13378 SDValue(K0, 0), SDValue(K1, 0));
13379 }
13380 }
13381
13382 return SDValue();
13383}
13384
13385/// \return true if the subtarget supports minimum3 and maximum3 with the given
13386/// base min/max opcode \p Opc for type \p VT.
13387static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13388 EVT VT) {
13389 switch (Opc) {
13390 case ISD::FMINNUM:
13391 case ISD::FMAXNUM:
13392 case ISD::FMINNUM_IEEE:
13393 case ISD::FMAXNUM_IEEE:
13394  case AMDGPUISD::FMIN_LEGACY:
13395  case AMDGPUISD::FMAX_LEGACY:
13396    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13397 case ISD::FMINIMUM:
13398 case ISD::FMAXIMUM:
13399 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13400 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13401 case ISD::SMAX:
13402 case ISD::SMIN:
13403 case ISD::UMAX:
13404 case ISD::UMIN:
13405 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13406 default:
13407 return false;
13408 }
13409
13410 llvm_unreachable("not a min/max opcode");
13411}
13412
13413SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13414 DAGCombinerInfo &DCI) const {
13415 SelectionDAG &DAG = DCI.DAG;
13416
13417 EVT VT = N->getValueType(0);
13418 unsigned Opc = N->getOpcode();
13419 SDValue Op0 = N->getOperand(0);
13420 SDValue Op1 = N->getOperand(1);
13421
13422  // Only do this if the inner op has one use, since otherwise this would just
13423  // increase register pressure for no benefit.
13424
13425 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13426 // max(max(a, b), c) -> max3(a, b, c)
13427 // min(min(a, b), c) -> min3(a, b, c)
13428 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13429 SDLoc DL(N);
13430 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13431 Op0.getOperand(0), Op0.getOperand(1), Op1);
13432 }
13433
13434 // Try commuted.
13435 // max(a, max(b, c)) -> max3(a, b, c)
13436 // min(a, min(b, c)) -> min3(a, b, c)
13437 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13438 SDLoc DL(N);
13439 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13440 Op0, Op1.getOperand(0), Op1.getOperand(1));
13441 }
13442 }
13443
13444 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13445 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13446 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13447 if (SDValue Med3 = performIntMed3ImmCombine(
13448 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13449 return Med3;
13450 }
13451 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13452 if (SDValue Med3 = performIntMed3ImmCombine(
13453 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13454 return Med3;
13455 }
13456
13457 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13458 if (SDValue Med3 = performIntMed3ImmCombine(
13459 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13460 return Med3;
13461 }
13462 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13463 if (SDValue Med3 = performIntMed3ImmCombine(
13464 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13465 return Med3;
13466 }
13467
13468 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13469 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13470 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13471 (Opc == AMDGPUISD::FMIN_LEGACY &&
13472 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13473 (VT == MVT::f32 || VT == MVT::f64 ||
13474 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13475 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13476 Op0.hasOneUse()) {
13477 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13478 return Res;
13479 }
13480
13481 return SDValue();
13482}
13483
13485 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13486 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13487 // FIXME: Should this be allowing -0.0?
13488 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13489 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13490 }
13491 }
13492
13493 return false;
13494}
13495
13496// FIXME: Should only worry about snans for version with chain.
13497SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13498 DAGCombinerInfo &DCI) const {
13499 EVT VT = N->getValueType(0);
13500 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13501 // NaNs. With a NaN input, the order of the operands may change the result.
13502
13503 SelectionDAG &DAG = DCI.DAG;
13504 SDLoc SL(N);
13505
13506 SDValue Src0 = N->getOperand(0);
13507 SDValue Src1 = N->getOperand(1);
13508 SDValue Src2 = N->getOperand(2);
13509
13510 if (isClampZeroToOne(Src0, Src1)) {
13511 // const_a, const_b, x -> clamp is safe in all cases including signaling
13512 // nans.
13513 // FIXME: Should this be allowing -0.0?
13514 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13515 }
13516
13517 const MachineFunction &MF = DAG.getMachineFunction();
13518  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13519
13520 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13521 // handling no dx10-clamp?
13522 if (Info->getMode().DX10Clamp) {
13523    // If NaNs are clamped to 0.0, we are free to reorder the inputs.
13524
13525 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13526 std::swap(Src0, Src1);
13527
13528 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13529 std::swap(Src1, Src2);
13530
13531 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13532 std::swap(Src0, Src1);
13533
13534 if (isClampZeroToOne(Src1, Src2))
13535 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13536 }
13537
13538 return SDValue();
13539}
13540
13541SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13542 DAGCombinerInfo &DCI) const {
13543 SDValue Src0 = N->getOperand(0);
13544 SDValue Src1 = N->getOperand(1);
13545 if (Src0.isUndef() && Src1.isUndef())
13546 return DCI.DAG.getUNDEF(N->getValueType(0));
13547 return SDValue();
13548}
13549
13550// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13551// expanded into a set of cmp/select instructions.
13552bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13553                                                unsigned NumElem,
13554 bool IsDivergentIdx,
13555 const GCNSubtarget *Subtarget) {
13556  if (UseDivergentRegisterIndexing)
13557    return false;
13558
13559 unsigned VecSize = EltSize * NumElem;
13560
13561  // Sub-dword vectors of 2 dwords or less have a better implementation.
13562 if (VecSize <= 64 && EltSize < 32)
13563 return false;
13564
13565  // Always expand the remaining sub-dword cases, otherwise they will be
13566  // lowered via memory.
13567 if (EltSize < 32)
13568 return true;
13569
13570 // Always do this if var-idx is divergent, otherwise it will become a loop.
13571 if (IsDivergentIdx)
13572 return true;
13573
13574 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13575 unsigned NumInsts = NumElem /* Number of compares */ +
13576 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
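  // For example, a v8i32 extract with a uniform variable index costs roughly
  // 8 compares + 8 v_cndmask_b32 = 16 instructions: expanded in VGPR index
  // mode (16 <= 16), but left to movrel otherwise (16 > 15).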
13577
13578 // On some architectures (GFX9) movrel is not available and it's better
13579 // to expand.
13580 if (Subtarget->useVGPRIndexMode())
13581 return NumInsts <= 16;
13582
13583 // If movrel is available, use it instead of expanding for vector of 8
13584 // elements.
13585 if (Subtarget->hasMovrel())
13586 return NumInsts <= 15;
13587
13588 return true;
13589}
13590
13591bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13592  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13593 if (isa<ConstantSDNode>(Idx))
13594 return false;
13595
13596 SDValue Vec = N->getOperand(0);
13597 EVT VecVT = Vec.getValueType();
13598 EVT EltVT = VecVT.getVectorElementType();
13599 unsigned EltSize = EltVT.getSizeInBits();
13600 unsigned NumElem = VecVT.getVectorNumElements();
13601
13602  return SITargetLowering::shouldExpandVectorDynExt(
13603      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13604}
13605
13606SDValue
13607SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13608 DAGCombinerInfo &DCI) const {
13609 SDValue Vec = N->getOperand(0);
13610 SelectionDAG &DAG = DCI.DAG;
13611
13612 EVT VecVT = Vec.getValueType();
13613 EVT VecEltVT = VecVT.getVectorElementType();
13614 EVT ResVT = N->getValueType(0);
13615
13616 unsigned VecSize = VecVT.getSizeInBits();
13617 unsigned VecEltSize = VecEltVT.getSizeInBits();
13618
13619 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13620      allUsesHaveSourceMods(N)) {
13621    SDLoc SL(N);
13622 SDValue Idx = N->getOperand(1);
13623 SDValue Elt =
13624 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13625 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13626 }
13627
13628 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13629 // =>
13630 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13631 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13632 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13633 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13634 SDLoc SL(N);
13635 SDValue Idx = N->getOperand(1);
13636 unsigned Opc = Vec.getOpcode();
13637
13638 switch (Opc) {
13639 default:
13640 break;
13641 // TODO: Support other binary operations.
13642 case ISD::FADD:
13643 case ISD::FSUB:
13644 case ISD::FMUL:
13645 case ISD::ADD:
13646 case ISD::UMIN:
13647 case ISD::UMAX:
13648 case ISD::SMIN:
13649 case ISD::SMAX:
13650 case ISD::FMAXNUM:
13651 case ISD::FMINNUM:
13652 case ISD::FMAXNUM_IEEE:
13653 case ISD::FMINNUM_IEEE:
13654 case ISD::FMAXIMUM:
13655 case ISD::FMINIMUM: {
13656 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13657 Vec.getOperand(0), Idx);
13658 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13659 Vec.getOperand(1), Idx);
13660
13661 DCI.AddToWorklist(Elt0.getNode());
13662 DCI.AddToWorklist(Elt1.getNode());
13663 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13664 }
13665 }
13666 }
13667
13668 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13669  if (shouldExpandVectorDynExt(N)) {
13670    SDLoc SL(N);
13671 SDValue Idx = N->getOperand(1);
13672 SDValue V;
13673 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13674 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13675 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13676 if (I == 0)
13677 V = Elt;
13678 else
13679 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13680 }
13681 return V;
13682 }
13683
13684 if (!DCI.isBeforeLegalize())
13685 return SDValue();
13686
13687 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13688 // elements. This exposes more load reduction opportunities by replacing
13689 // multiple small extract_vector_elements with a single 32-bit extract.
13690 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13691 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13692 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13693 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13694
13695 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13696 unsigned EltIdx = BitIndex / 32;
13697 unsigned LeftoverBitIdx = BitIndex % 32;
13698 SDLoc SL(N);
13699
13700 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13701 DCI.AddToWorklist(Cast.getNode());
13702
13703 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13704 DAG.getConstant(EltIdx, SL, MVT::i32));
13705 DCI.AddToWorklist(Elt.getNode());
13706 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13707 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13708 DCI.AddToWorklist(Srl.getNode());
13709
13710 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13711 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13712 DCI.AddToWorklist(Trunc.getNode());
13713
13714 if (VecEltVT == ResVT) {
13715 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13716 }
13717
13718 assert(ResVT.isScalarInteger());
13719 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13720 }
13721
13722 return SDValue();
13723}
13724
13725SDValue
13726SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13727 DAGCombinerInfo &DCI) const {
13728 SDValue Vec = N->getOperand(0);
13729 SDValue Idx = N->getOperand(2);
13730 EVT VecVT = Vec.getValueType();
13731 EVT EltVT = VecVT.getVectorElementType();
13732
13733 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13734 // => BUILD_VECTOR n x select (e, const-idx)
13735  if (!shouldExpandVectorDynExt(N))
13736    return SDValue();
13737
13738 SelectionDAG &DAG = DCI.DAG;
13739 SDLoc SL(N);
13740 SDValue Ins = N->getOperand(1);
13741 EVT IdxVT = Idx.getValueType();
13742
13743  SmallVector<SDValue, 16> Ops;
13744  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13745 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13746 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13747 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13748 Ops.push_back(V);
13749 }
13750
13751 return DAG.getBuildVector(VecVT, SL, Ops);
13752}
13753
13754/// Return the source of an fp_extend from f16 to f32, or a converted FP
13755/// constant.
13756static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13757  if (Src.getOpcode() == ISD::FP_EXTEND &&
13758 Src.getOperand(0).getValueType() == MVT::f16) {
13759 return Src.getOperand(0);
13760 }
13761
13762 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13763 APFloat Val = CFP->getValueAPF();
13764 bool LosesInfo = true;
13765    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13766    if (!LosesInfo)
13767 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13768 }
13769
13770 return SDValue();
13771}
13772
13773SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13774 DAGCombinerInfo &DCI) const {
13775 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13776 "combine only useful on gfx8");
13777
13778 SDValue TruncSrc = N->getOperand(0);
13779 EVT VT = N->getValueType(0);
13780 if (VT != MVT::f16)
13781 return SDValue();
13782
13783 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13784 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13785 return SDValue();
13786
13787 SelectionDAG &DAG = DCI.DAG;
13788 SDLoc SL(N);
13789
13790 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13791 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13792 // casting back.
13793
13794 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13795 // fmin(fmax(a, b), fmax(fmin(a, b), c))
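  // For ordered (non-NaN) inputs this is the usual median identity:
  //   med3(a, b, c) == fmin(fmax(a, b), fmax(fmin(a, b), c))
  // Signaling-NaN behavior can differ, as discussed in the comment below.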
13796 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13797 if (!A)
13798 return SDValue();
13799
13800 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13801 if (!B)
13802 return SDValue();
13803
13804 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13805 if (!C)
13806 return SDValue();
13807
13808 // This changes signaling nan behavior. If an input is a signaling nan, it
13809 // would have been quieted by the fpext originally. We don't care because
13810 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13811 // we would be worse off than just doing the promotion.
13812 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13813 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13814 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13815 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13816}
13817
13818unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13819 const SDNode *N0,
13820 const SDNode *N1) const {
13821 EVT VT = N0->getValueType(0);
13822
13823 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13824 // support denormals ever.
13825  if (((VT == MVT::f32 &&
13826        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13827       (VT == MVT::f16 && Subtarget->hasMadF16() &&
13828        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13829      isOperationLegal(ISD::FMAD, VT))
13830    return ISD::FMAD;
13831
13832 const TargetOptions &Options = DAG.getTarget().Options;
13833 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13834 (N0->getFlags().hasAllowContract() &&
13835 N1->getFlags().hasAllowContract())) &&
13836      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13837    return ISD::FMA;
13838 }
13839
13840 return 0;
13841}
13842
13843// For a reassociatable opcode perform:
13844// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13845SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13846 SelectionDAG &DAG) const {
13847 EVT VT = N->getValueType(0);
13848 if (VT != MVT::i32 && VT != MVT::i64)
13849 return SDValue();
13850
13851 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13852 return SDValue();
13853
13854 unsigned Opc = N->getOpcode();
13855 SDValue Op0 = N->getOperand(0);
13856 SDValue Op1 = N->getOperand(1);
13857
13858 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13859 return SDValue();
13860
13861 if (Op0->isDivergent())
13862 std::swap(Op0, Op1);
13863
13864 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13865 return SDValue();
13866
13867 SDValue Op2 = Op1.getOperand(1);
13868 Op1 = Op1.getOperand(0);
13869 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13870 return SDValue();
13871
13872 if (Op1->isDivergent())
13873 std::swap(Op1, Op2);
13874
13875 SDLoc SL(N);
13876 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13877 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13878}
13879
13880static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13881 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13882  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13883  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13884 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13885 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13886}
13887
13888// Fold
13889// y = lshr i64 x, 32
13890// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
13891// with Const.hi == -1
13892// To
13893// res = mad_u64_u32 y.lo, Const.lo, x.lo
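// Why this holds (all arithmetic modulo 2^64): with y = x >> 32 and
// Const.hi == -1,
//   y * Const + x = y * Const.lo - (y << 32) + (x.hi << 32) + x.lo
//                 = y * Const.lo + x.lo            (since y == x.hi)
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)).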
13894static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
13895                                 SDValue MulLHS, SDValue MulRHS,
13896 SDValue AddRHS) {
13897 if (MulRHS.getOpcode() == ISD::SRL)
13898 std::swap(MulLHS, MulRHS);
13899
13900 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
13901 return SDValue();
13902
13903 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
13904 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
13905 MulLHS.getOperand(0) != AddRHS)
13906 return SDValue();
13907
13908 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
13909 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13910 return SDValue();
13911
13912 SDValue ConstMul =
13913 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
13914 return getMad64_32(DAG, SL, MVT::i64,
13915 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
13916 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
13917}
13918
13919// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13920// multiplies, if any.
13921//
13922// Full 64-bit multiplies that feed into an addition are lowered here instead
13923// of using the generic expansion. The generic expansion ends up with
13924// a tree of ADD nodes that prevents us from using the "add" part of the
13925// MAD instruction. The expansion produced here results in a chain of ADDs
13926// instead of a tree.
13927SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13928 DAGCombinerInfo &DCI) const {
13929 assert(N->getOpcode() == ISD::ADD);
13930
13931 SelectionDAG &DAG = DCI.DAG;
13932 EVT VT = N->getValueType(0);
13933 SDLoc SL(N);
13934 SDValue LHS = N->getOperand(0);
13935 SDValue RHS = N->getOperand(1);
13936
13937 if (VT.isVector())
13938 return SDValue();
13939
13940 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13941 // result in scalar registers for uniform values.
13942 if (!N->isDivergent() && Subtarget->hasSMulHi())
13943 return SDValue();
13944
13945 unsigned NumBits = VT.getScalarSizeInBits();
13946 if (NumBits <= 32 || NumBits > 64)
13947 return SDValue();
13948
13949 if (LHS.getOpcode() != ISD::MUL) {
13950 assert(RHS.getOpcode() == ISD::MUL);
13951 std::swap(LHS, RHS);
13952 }
13953
13954 // Avoid the fold if it would unduly increase the number of multiplies due to
13955 // multiple uses, except on hardware with full-rate multiply-add (which is
13956 // part of full-rate 64-bit ops).
13957 if (!Subtarget->hasFullRate64Ops()) {
13958 unsigned NumUsers = 0;
13959 for (SDNode *User : LHS->users()) {
13960 // There is a use that does not feed into addition, so the multiply can't
13961 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13962 if (User->getOpcode() != ISD::ADD)
13963 return SDValue();
13964
13965 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13966 // MUL + 3xADD + 3xADDC over 3xMAD.
13967 ++NumUsers;
13968 if (NumUsers >= 3)
13969 return SDValue();
13970 }
13971 }
13972
13973 SDValue MulLHS = LHS.getOperand(0);
13974 SDValue MulRHS = LHS.getOperand(1);
13975 SDValue AddRHS = RHS;
13976
13977 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
13978 return FoldedMAD;
13979
13980 // Always check whether operands are small unsigned values, since that
13981 // knowledge is useful in more cases. Check for small signed values only if
13982 // doing so can unlock a shorter code sequence.
13983 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13984 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13985
13986 bool MulSignedLo = false;
13987 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13988 MulSignedLo =
13989 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13990 }
13991
13992 // The operands and final result all have the same number of bits. If
13993 // operands need to be extended, they can be extended with garbage. The
13994 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13995 // truncated away in the end.
13996 if (VT != MVT::i64) {
13997 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13998 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13999 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14000 }
14001
14002 // The basic code generated is conceptually straightforward. Pseudo code:
14003 //
14004 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14005 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14006 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14007 //
14008 // The second and third lines are optional, depending on whether the factors
14009 // are {sign,zero}-extended or not.
14010 //
14011 // The actual DAG is noisier than the pseudo code, but only due to
14012 // instructions that disassemble values into low and high parts, and
14013 // assemble the final result.
14014 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14015
14016 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14017 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14018 SDValue Accum =
14019 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14020
14021 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14022 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14023
14024 if (!MulLHSUnsigned32) {
14025 auto MulLHSHi =
14026 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14027 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14028 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14029 }
14030
14031 if (!MulRHSUnsigned32) {
14032 auto MulRHSHi =
14033 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14034 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14035 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14036 }
14037
14038 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14039 Accum = DAG.getBitcast(MVT::i64, Accum);
14040 }
14041
14042 if (VT != MVT::i64)
14043 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14044 return Accum;
14045}
14046
14047SDValue
14048SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14049 DAGCombinerInfo &DCI) const {
14050 SDValue RHS = N->getOperand(1);
14051 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14052 if (!CRHS)
14053 return SDValue();
14054
14055 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14056 // common.
14057 uint64_t Val = CRHS->getZExtValue();
14058 if (countr_zero(Val) >= 32) {
14059 SelectionDAG &DAG = DCI.DAG;
14060 SDLoc SL(N);
14061 SDValue LHS = N->getOperand(0);
14062
14063 // Avoid carry machinery if we know the low half of the add does not
14064 // contribute to the final result.
14065 //
14066 // add i64:x, K if computeTrailingZeros(K) >= 32
14067 // => build_pair (add x.hi, K.hi), x.lo
14068
14069 // Breaking the 64-bit add here with this strange constant is unlikely
14070 // to interfere with addressing mode patterns.
14071
14072 SDValue Hi = getHiHalf64(LHS, DAG);
14073 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14074 SDValue AddHi =
14075 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14076
14077 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14078 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14079 }
14080
14081 return SDValue();
14082}
14083
14084// Collect the ultimate src of each of the mul node's operands, and confirm
14085// each operand is only 8 bits wide.
14086static std::optional<ByteProvider<SDValue>>
14087handleMulOperand(const SDValue &MulOperand) {
14088 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14089 if (!Byte0 || Byte0->isConstantZero()) {
14090 return std::nullopt;
14091 }
14092 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14093 if (Byte1 && !Byte1->isConstantZero()) {
14094 return std::nullopt;
14095 }
14096 return Byte0;
14097}
14098
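// In a v_perm_b32 selector each result byte picks one of the eight source
// bytes (values 0-7), while the value 0x0c produces a constant 0x00 byte.
// addPermMasks merges two masks whose live (non-0x0c) byte lanes are
// disjoint, e.g. addPermMasks(0x0c0c0c01, 0x0c0c020c) == 0x0c0c0201.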
14099static unsigned addPermMasks(unsigned First, unsigned Second) {
14100 unsigned FirstCs = First & 0x0c0c0c0c;
14101 unsigned SecondCs = Second & 0x0c0c0c0c;
14102 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14103 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14104
14105 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14106 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14107 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14108 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14109
14110 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14111}
14112
14113struct DotSrc {
14114  SDValue SrcOp;
14115  int64_t PermMask;
14116  int64_t DWordOffset;
14117};
14118
14119static void placeSources(ByteProvider<SDValue> &Src0,
14120                         ByteProvider<SDValue> &Src1,
14121                         SmallVectorImpl<DotSrc> &Src0s,
14122                         SmallVectorImpl<DotSrc> &Src1s, int Step) {
14123
14124 assert(Src0.Src.has_value() && Src1.Src.has_value());
14125 // Src0s and Src1s are empty, just place arbitrarily.
14126 if (Step == 0) {
14127 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14128 Src0.SrcOffset / 4});
14129 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14130 Src1.SrcOffset / 4});
14131 return;
14132 }
14133
14134 for (int BPI = 0; BPI < 2; BPI++) {
14135 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14136 if (BPI == 1) {
14137 BPP = {Src1, Src0};
14138 }
14139 unsigned ZeroMask = 0x0c0c0c0c;
14140 unsigned FMask = 0xFF << (8 * (3 - Step));
14141
14142 unsigned FirstMask =
14143 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14144 unsigned SecondMask =
14145 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14146    // Attempt to find a Src vector which contains our SDValue; if found, add
14147    // our perm mask to the existing one. If we are unable to find a match for
14148    // the first SDValue, attempt to find a match for the second.
14149 int FirstGroup = -1;
14150 for (int I = 0; I < 2; I++) {
14151 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14152 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14153 return IterElt.SrcOp == *BPP.first.Src &&
14154 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14155 };
14156
14157 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14158 if (Match != Srcs.end()) {
14159 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14160 FirstGroup = I;
14161 break;
14162 }
14163 }
14164 if (FirstGroup != -1) {
14165 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14166 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14167 return IterElt.SrcOp == *BPP.second.Src &&
14168 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14169 };
14170 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14171 if (Match != Srcs.end()) {
14172 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14173 } else
14174 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14175 return;
14176 }
14177 }
14178
14179 // If we have made it here, then we could not find a match in Src0s or Src1s
14180 // for either Src0 or Src1, so just place them arbitrarily.
14181
14182 unsigned ZeroMask = 0x0c0c0c0c;
14183 unsigned FMask = 0xFF << (8 * (3 - Step));
14184
14185 Src0s.push_back(
14186 {*Src0.Src,
14187 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14188 Src0.SrcOffset / 4});
14189 Src1s.push_back(
14190 {*Src1.Src,
14191 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14192 Src1.SrcOffset / 4});
14193}
14194
14195static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14196                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14197 bool IsAny) {
14198
14199  // If we have only one source, just permute it accordingly.
14200 if (Srcs.size() == 1) {
14201 auto *Elt = Srcs.begin();
14202 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14203
14204 // v_perm will produce the original value
14205 if (Elt->PermMask == 0x3020100)
14206 return EltOp;
14207
14208 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14209 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14210 }
14211
14212 auto *FirstElt = Srcs.begin();
14213 auto *SecondElt = std::next(FirstElt);
14214
14215  SmallVector<SDValue, 2> Perms;
14216
14217  // If we have multiple sources in the chain, combine them via perms (using
14218  // the calculated perm masks) and ORs.
14219 while (true) {
14220 auto FirstMask = FirstElt->PermMask;
14221 auto SecondMask = SecondElt->PermMask;
14222
14223 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14224 unsigned FirstPlusFour = FirstMask | 0x04040404;
14225    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14226    // original 0x0c byte.
14227 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14228
14229 auto PermMask = addPermMasks(FirstMask, SecondMask);
14230 auto FirstVal =
14231 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14232 auto SecondVal =
14233 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14234
14235 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14236 SecondVal,
14237 DAG.getConstant(PermMask, SL, MVT::i32)));
14238
14239 FirstElt = std::next(SecondElt);
14240 if (FirstElt == Srcs.end())
14241 break;
14242
14243 SecondElt = std::next(FirstElt);
14244 // If we only have a FirstElt, then just combine that into the cumulative
14245 // source node.
14246 if (SecondElt == Srcs.end()) {
14247 auto EltOp =
14248 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14249
14250 Perms.push_back(
14251 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14252 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14253 break;
14254 }
14255 }
14256
14257 assert(Perms.size() == 1 || Perms.size() == 2);
14258 return Perms.size() == 2
14259 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14260 : Perms[0];
14261}
14262
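// The dot4 chain is matched MSB-first, so a partial chain leaves its byte
// selectors in the upper mask bytes. fixMasks shifts them down and refills
// the now-unused upper bytes with 0x0c (constant zero), e.g. for
// ChainLength == 2: 0x01000c0c --> (0x01000c0c >> 16) + 0x0c0c0000 == 0x0c0c0100.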
14263static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14264 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14265 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14266 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14267 EntryMask += ZeroMask;
14268 }
14269}
14270
14271static bool isMul(const SDValue Op) {
14272 auto Opcode = Op.getOpcode();
14273
14274 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14275 Opcode == AMDGPUISD::MUL_I24);
14276}
14277
14278static std::optional<bool>
14279checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14280                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14281 const SDValue &S1Op, const SelectionDAG &DAG) {
14282  // If both ops are i8s (pre legalize-dag), then the signedness semantics
14283  // of the dot4 are irrelevant.
14284 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14285 return false;
14286
14287 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14288 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14289 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14290 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14291 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14292 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14293
14294 assert(!(S0IsUnsigned && S0IsSigned));
14295 assert(!(S1IsUnsigned && S1IsSigned));
14296
14297 // There are 9 possible permutations of
14298 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14299
14300 // In two permutations, the sign bits are known to be the same for both Ops,
14301 // so simply return Signed / Unsigned corresponding to the MSB
14302
14303 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14304 return S0IsSigned;
14305
14306 // In another two permutations, the sign bits are known to be opposite. In
14307 // this case return std::nullopt to indicate a bad match.
14308
14309 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14310 return std::nullopt;
14311
14312 // In the remaining five permutations, we don't know the value of the sign
14313 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14314  // the upper bits must be extension bits. Thus, the only way for the sign
14315  // bit to be unknown is if it was sign extended from an unknown value, or if
14316  // it was any extended. In either case, it is correct to use the signed
14317  // version of the dot4 signedness semantics.
14318
14319  // In two of these permutations, we know the sign bit is set for
14320  // one op and the other is unknown. It is okay to use the signed version
14321  // of dot4.
14322 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14323 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14324 return true;
14325
14326  // In one such permutation, we don't know either of the sign bits. It is
14327  // okay to use the signed version of dot4.
14328 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14329 return true;
14330
14331  // In two of these permutations, we know the sign bit is unset for
14332  // one op and the other is unknown. Return std::nullopt to indicate a
14333  // bad match.
14334 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14335 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14336 return std::nullopt;
14337
14338 llvm_unreachable("Fully covered condition");
14339}
14340
14341SDValue SITargetLowering::performAddCombine(SDNode *N,
14342 DAGCombinerInfo &DCI) const {
14343 SelectionDAG &DAG = DCI.DAG;
14344 EVT VT = N->getValueType(0);
14345 SDLoc SL(N);
14346 SDValue LHS = N->getOperand(0);
14347 SDValue RHS = N->getOperand(1);
14348
14349 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14350 if (Subtarget->hasMad64_32()) {
14351 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14352 return Folded;
14353 }
14354 }
14355
14356 if (SDValue V = reassociateScalarOps(N, DAG)) {
14357 return V;
14358 }
14359
14360 if (VT == MVT::i64) {
14361 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14362 return Folded;
14363 }
14364
14365 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14366 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14367 SDValue TempNode(N, 0);
14368 std::optional<bool> IsSigned;
14369    SmallVector<DotSrc, 4> Src0s;
14370    SmallVector<DotSrc, 4> Src1s;
14371    SmallVector<SDValue, 4> Src2s;
14372
14373 // Match the v_dot4 tree, while collecting src nodes.
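    // The tree being matched here has the shape
    //   add(mul(a0, b0), add(mul(a1, b1), add(mul(a2, b2), mul(a3, b3))))
    // where each aI/bI is a single byte. Shorter chains (at least two
    // multiplies) are also accepted; unused lanes are zeroed via the perm
    // masks (see fixMasks).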
14374 int ChainLength = 0;
14375 for (int I = 0; I < 4; I++) {
14376 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14377 if (MulIdx == -1)
14378 break;
14379 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14380 if (!Src0)
14381 break;
14382 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14383 if (!Src1)
14384 break;
14385
14386 auto IterIsSigned = checkDot4MulSignedness(
14387 TempNode->getOperand(MulIdx), *Src0, *Src1,
14388 TempNode->getOperand(MulIdx)->getOperand(0),
14389 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14390 if (!IterIsSigned)
14391 break;
14392 if (!IsSigned)
14393 IsSigned = *IterIsSigned;
14394 if (*IterIsSigned != *IsSigned)
14395 break;
14396 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14397 auto AddIdx = 1 - MulIdx;
14398      // Allow the special case where add (add (mul24, 0), mul24) became
14399      // add (mul24, mul24).
14400 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14401 Src2s.push_back(TempNode->getOperand(AddIdx));
14402 auto Src0 =
14403 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14404 if (!Src0)
14405 break;
14406 auto Src1 =
14407 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14408 if (!Src1)
14409 break;
14410 auto IterIsSigned = checkDot4MulSignedness(
14411 TempNode->getOperand(AddIdx), *Src0, *Src1,
14412 TempNode->getOperand(AddIdx)->getOperand(0),
14413 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14414 if (!IterIsSigned)
14415 break;
14416 assert(IsSigned);
14417 if (*IterIsSigned != *IsSigned)
14418 break;
14419 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14420 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14421 ChainLength = I + 2;
14422 break;
14423 }
14424
14425 TempNode = TempNode->getOperand(AddIdx);
14426 Src2s.push_back(TempNode);
14427 ChainLength = I + 1;
14428 if (TempNode->getNumOperands() < 2)
14429 break;
14430 LHS = TempNode->getOperand(0);
14431 RHS = TempNode->getOperand(1);
14432 }
14433
14434 if (ChainLength < 2)
14435 return SDValue();
14436
14437    // Masks were constructed with the assumption that we would find a chain of
14438    // length 4. If not, then we need to zero out the MSB bytes (via a perm mask
14439    // of 0x0c) so they do not affect the dot calculation.
14440 if (ChainLength < 4) {
14441 fixMasks(Src0s, ChainLength);
14442 fixMasks(Src1s, ChainLength);
14443 }
14444
14445 SDValue Src0, Src1;
14446
14447 // If we are just using a single source for both, and have permuted the
14448 // bytes consistently, we can just use the sources without permuting
14449 // (commutation).
14450 bool UseOriginalSrc = false;
14451 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14452 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14453 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14454 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14455 SmallVector<unsigned, 4> SrcBytes;
14456 auto Src0Mask = Src0s.begin()->PermMask;
14457 SrcBytes.push_back(Src0Mask & 0xFF000000);
14458 bool UniqueEntries = true;
14459 for (auto I = 1; I < 4; I++) {
14460 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14461
14462 if (is_contained(SrcBytes, NextByte)) {
14463 UniqueEntries = false;
14464 break;
14465 }
14466 SrcBytes.push_back(NextByte);
14467 }
14468
14469 if (UniqueEntries) {
14470 UseOriginalSrc = true;
14471
14472 auto *FirstElt = Src0s.begin();
14473 auto FirstEltOp =
14474 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14475
14476 auto *SecondElt = Src1s.begin();
14477 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14478 SecondElt->DWordOffset);
14479
14480 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14481 MVT::getIntegerVT(32));
14482 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14483 MVT::getIntegerVT(32));
14484 }
14485 }
14486
14487 if (!UseOriginalSrc) {
14488 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14489 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14490 }
14491
14492 assert(IsSigned);
14493 SDValue Src2 =
14494 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14495
14496 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14497 : Intrinsic::amdgcn_udot4,
14498 SL, MVT::i64);
14499
14500 assert(!VT.isVector());
14501 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14502 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14503
14504 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14505 }
14506
14507 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14508 return SDValue();
14509
14510 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14511 // add x, sext (setcc) => usubo_carry x, 0, setcc
14512 unsigned Opc = LHS.getOpcode();
14513 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14514 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14515 std::swap(RHS, LHS);
14516
14517 Opc = RHS.getOpcode();
14518 switch (Opc) {
14519 default:
14520 break;
14521 case ISD::ZERO_EXTEND:
14522 case ISD::SIGN_EXTEND:
14523 case ISD::ANY_EXTEND: {
14524 auto Cond = RHS.getOperand(0);
14525 // If this won't be a real VOPC output, we would still need to insert an
14526 // extra instruction anyway.
14527 if (!isBoolSGPR(Cond))
14528 break;
14529 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14530 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14531    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14532    return DAG.getNode(Opc, SL, VTList, Args);
14533 }
14534 case ISD::UADDO_CARRY: {
14535 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14536 if (!isNullConstant(RHS.getOperand(1)))
14537 break;
14538 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14539 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14540 }
14541 }
14542 return SDValue();
14543}
14544
14545SDValue SITargetLowering::performSubCombine(SDNode *N,
14546 DAGCombinerInfo &DCI) const {
14547 SelectionDAG &DAG = DCI.DAG;
14548 EVT VT = N->getValueType(0);
14549
14550 if (VT == MVT::i64) {
14551 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14552 return Folded;
14553 }
14554
14555 if (VT != MVT::i32)
14556 return SDValue();
14557
14558 SDLoc SL(N);
14559 SDValue LHS = N->getOperand(0);
14560 SDValue RHS = N->getOperand(1);
14561
14562 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14563 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14564 unsigned Opc = RHS.getOpcode();
14565 switch (Opc) {
14566 default:
14567 break;
14568 case ISD::ZERO_EXTEND:
14569 case ISD::SIGN_EXTEND:
14570 case ISD::ANY_EXTEND: {
14571 auto Cond = RHS.getOperand(0);
14572 // If this won't be a real VOPC output, we would still need to insert an
14573 // extra instruction anyway.
14574 if (!isBoolSGPR(Cond))
14575 break;
14576 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14577 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14578    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14579    return DAG.getNode(Opc, SL, VTList, Args);
14580 }
14581 }
14582
14583 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14584 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14585 if (!isNullConstant(LHS.getOperand(1)))
14586 return SDValue();
14587 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14588 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14589 }
14590 return SDValue();
14591}
14592
14593SDValue
14594SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14595 DAGCombinerInfo &DCI) const {
14596
14597 if (N->getValueType(0) != MVT::i32)
14598 return SDValue();
14599
14600 if (!isNullConstant(N->getOperand(1)))
14601 return SDValue();
14602
14603 SelectionDAG &DAG = DCI.DAG;
14604 SDValue LHS = N->getOperand(0);
14605
14606 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14607 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14608 unsigned LHSOpc = LHS.getOpcode();
14609 unsigned Opc = N->getOpcode();
14610 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14611 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14612 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14613 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14614 }
14615 return SDValue();
14616}
14617
14618SDValue SITargetLowering::performFAddCombine(SDNode *N,
14619 DAGCombinerInfo &DCI) const {
14620 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14621 return SDValue();
14622
14623 SelectionDAG &DAG = DCI.DAG;
14624 EVT VT = N->getValueType(0);
14625
14626 SDLoc SL(N);
14627 SDValue LHS = N->getOperand(0);
14628 SDValue RHS = N->getOperand(1);
14629
14630 // These should really be instruction patterns, but writing patterns with
14631 // source modifiers is a pain.
14632
14633 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14634 if (LHS.getOpcode() == ISD::FADD) {
14635 SDValue A = LHS.getOperand(0);
14636 if (A == LHS.getOperand(1)) {
14637 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14638 if (FusedOp != 0) {
14639 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14640 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14641 }
14642 }
14643 }
14644
14645 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14646 if (RHS.getOpcode() == ISD::FADD) {
14647 SDValue A = RHS.getOperand(0);
14648 if (A == RHS.getOperand(1)) {
14649 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14650 if (FusedOp != 0) {
14651 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14652 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14653 }
14654 }
14655 }
14656
14657 return SDValue();
14658}
14659
14660SDValue SITargetLowering::performFSubCombine(SDNode *N,
14661 DAGCombinerInfo &DCI) const {
14662 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14663 return SDValue();
14664
14665 SelectionDAG &DAG = DCI.DAG;
14666 SDLoc SL(N);
14667 EVT VT = N->getValueType(0);
14668 assert(!VT.isVector());
14669
14670 // Try to get the fneg to fold into the source modifier. This undoes generic
14671 // DAG combines and folds them into the mad.
14672 //
14673 // Only do this if we are not trying to support denormals. v_mad_f32 does
14674 // not support denormals ever.
14675 SDValue LHS = N->getOperand(0);
14676 SDValue RHS = N->getOperand(1);
14677 if (LHS.getOpcode() == ISD::FADD) {
14678 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14679 SDValue A = LHS.getOperand(0);
14680 if (A == LHS.getOperand(1)) {
14681 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14682 if (FusedOp != 0) {
14683 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14684 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14685
14686 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14687 }
14688 }
14689 }
14690
14691 if (RHS.getOpcode() == ISD::FADD) {
14692 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14693
14694 SDValue A = RHS.getOperand(0);
14695 if (A == RHS.getOperand(1)) {
14696 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14697 if (FusedOp != 0) {
14698 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14699 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14700 }
14701 }
14702 }
14703
14704 return SDValue();
14705}
14706
14707SDValue SITargetLowering::performFDivCombine(SDNode *N,
14708 DAGCombinerInfo &DCI) const {
14709 SelectionDAG &DAG = DCI.DAG;
14710 SDLoc SL(N);
14711 EVT VT = N->getValueType(0);
14712 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14713 return SDValue();
14714
14715 SDValue LHS = N->getOperand(0);
14716 SDValue RHS = N->getOperand(1);
14717
14718 SDNodeFlags Flags = N->getFlags();
14719 SDNodeFlags RHSFlags = RHS->getFlags();
14720 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14721 !RHS->hasOneUse())
14722 return SDValue();
14723
14724 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14725 bool IsNegative = false;
14726 if (CLHS->isExactlyValue(1.0) ||
14727 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14728 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14729 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14730 if (RHS.getOpcode() == ISD::FSQRT) {
14731 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14732 SDValue Rsq =
14733 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14734 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14735 }
14736 }
14737 }
14738
14739 return SDValue();
14740}
14741
14742SDValue SITargetLowering::performFMulCombine(SDNode *N,
14743 DAGCombinerInfo &DCI) const {
14744 SelectionDAG &DAG = DCI.DAG;
14745 EVT VT = N->getValueType(0);
14746 EVT ScalarVT = VT.getScalarType();
14747 EVT IntVT = VT.changeElementType(MVT::i32);
14748
14749 SDValue LHS = N->getOperand(0);
14750 SDValue RHS = N->getOperand(1);
14751
14752  // It is cheaper to realize i32 inline constants than to materialize f16 or
14753  // f64 (or even non-inline f32) values; this is possible via ldexp, as shown
14754  // below:
14755 //
14756 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14757 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14758 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
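  // For example: fmul x, (select y, 0.5, 4.0) --> fldexp(x, (select y, -1, 2)),
  // since 0.5 == 2^-1 and 4.0 == 2^2.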
14759 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14760 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14761 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14762 if (!TrueNode)
14763 return SDValue();
14764 const ConstantFPSDNode *FalseNode =
14765 isConstOrConstSplatFP(RHS.getOperand(2));
14766 if (!FalseNode)
14767 return SDValue();
14768
14769 if (TrueNode->isNegative() != FalseNode->isNegative())
14770 return SDValue();
14771
14772 // For f32, only non-inline constants should be transformed.
14773    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14774    if (ScalarVT == MVT::f32 &&
14775 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14776 TII->isInlineConstant(FalseNode->getValueAPF()))
14777 return SDValue();
14778
14779 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14780 if (TrueNodeExpVal == INT_MIN)
14781 return SDValue();
14782 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14783 if (FalseNodeExpVal == INT_MIN)
14784 return SDValue();
14785
14786 SDLoc SL(N);
14787 SDValue SelectNode =
14788 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14789 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14790 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14791
14792 LHS = TrueNode->isNegative()
14793 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14794 : LHS;
14795
14796 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14797 }
14798
14799 return SDValue();
14800}
14801
14802SDValue SITargetLowering::performFMACombine(SDNode *N,
14803 DAGCombinerInfo &DCI) const {
14804 SelectionDAG &DAG = DCI.DAG;
14805 EVT VT = N->getValueType(0);
14806 SDLoc SL(N);
14807
14808 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14809 return SDValue();
14810
14811  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14812 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14813 SDValue Op1 = N->getOperand(0);
14814 SDValue Op2 = N->getOperand(1);
14815 SDValue FMA = N->getOperand(2);
14816
14817 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14818 Op2.getOpcode() != ISD::FP_EXTEND)
14819 return SDValue();
14820
14821 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14822 // regardless of the denorm mode setting. Therefore,
14823 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14824 const TargetOptions &Options = DAG.getTarget().Options;
14825 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14826 (N->getFlags().hasAllowContract() &&
14827 FMA->getFlags().hasAllowContract())) {
14828 Op1 = Op1.getOperand(0);
14829 Op2 = Op2.getOperand(0);
14830 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14831        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14832      return SDValue();
14833
14834 SDValue Vec1 = Op1.getOperand(0);
14835 SDValue Idx1 = Op1.getOperand(1);
14836 SDValue Vec2 = Op2.getOperand(0);
14837
14838 SDValue FMAOp1 = FMA.getOperand(0);
14839 SDValue FMAOp2 = FMA.getOperand(1);
14840 SDValue FMAAcc = FMA.getOperand(2);
14841
14842 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14843 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14844 return SDValue();
14845
14846 FMAOp1 = FMAOp1.getOperand(0);
14847 FMAOp2 = FMAOp2.getOperand(0);
14848 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14849        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14850      return SDValue();
14851
14852 SDValue Vec3 = FMAOp1.getOperand(0);
14853 SDValue Vec4 = FMAOp2.getOperand(0);
14854 SDValue Idx2 = FMAOp1.getOperand(1);
14855
14856 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14857 // Idx1 and Idx2 cannot be the same.
14858 Idx1 == Idx2)
14859 return SDValue();
14860
14861 if (Vec1 == Vec2 || Vec3 == Vec4)
14862 return SDValue();
14863
14864 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14865 return SDValue();
14866
14867 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14868 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14869 DAG.getTargetConstant(0, SL, MVT::i1));
14870 }
14871 }
14872 return SDValue();
14873}
14874
14875SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14876 DAGCombinerInfo &DCI) const {
14877 SelectionDAG &DAG = DCI.DAG;
14878 SDLoc SL(N);
14879
14880 SDValue LHS = N->getOperand(0);
14881 SDValue RHS = N->getOperand(1);
14882 EVT VT = LHS.getValueType();
14883 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14884
14885 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14886 if (!CRHS) {
14887 CRHS = dyn_cast<ConstantSDNode>(LHS);
14888 if (CRHS) {
14889 std::swap(LHS, RHS);
14890      CC = getSetCCSwappedOperands(CC);
14891    }
14892 }
14893
14894 if (CRHS) {
14895 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14896 isBoolSGPR(LHS.getOperand(0))) {
14897 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14898 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14899 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14900 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
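      // This works because a sext from i1 yields only 0 or -1, so comparing
      // the result against 0 or -1 collapses to cc itself or its negation
      // (xor cc, -1).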
14901 if ((CRHS->isAllOnes() &&
14902 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14903 (CRHS->isZero() &&
14904 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14905 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14906 DAG.getAllOnesConstant(SL, MVT::i1));
14907 if ((CRHS->isAllOnes() &&
14908 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14909 (CRHS->isZero() &&
14910 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14911 return LHS.getOperand(0);
14912 }
14913
14914 const APInt &CRHSVal = CRHS->getAPIntValue();
14915 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14916 LHS.getOpcode() == ISD::SELECT &&
14917 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14918 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14919 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14920 isBoolSGPR(LHS.getOperand(0))) {
14921 // Given CT != FT:
14922 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14923 // setcc (select cc, CT, CF), CF, ne => cc
14924 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14925 // setcc (select cc, CT, CF), CT, eq => cc
14926 const APInt &CT = LHS.getConstantOperandAPInt(1);
14927 const APInt &CF = LHS.getConstantOperandAPInt(2);
14928
14929 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14930 (CT == CRHSVal && CC == ISD::SETNE))
14931 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14932 DAG.getAllOnesConstant(SL, MVT::i1));
14933 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14934 (CT == CRHSVal && CC == ISD::SETEQ))
14935 return LHS.getOperand(0);
14936 }
14937 }
14938
14939 if (VT != MVT::f32 && VT != MVT::f64 &&
14940 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14941 return SDValue();
14942
14943 // Match isinf/isfinite pattern
14944 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14945 // (fcmp one (fabs x), inf) -> (fp_class x,
14946 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14947 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14948 LHS.getOpcode() == ISD::FABS) {
14949 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14950 if (!CRHS)
14951 return SDValue();
14952
14953 const APFloat &APF = CRHS->getValueAPF();
14954 if (APF.isInfinity() && !APF.isNegative()) {
14955 const unsigned IsInfMask =
14956 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14957 const unsigned IsFiniteMask =
14958 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14959 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14960 SIInstrFlags::P_SUBNORMAL;
14961 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14962 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14963 DAG.getConstant(Mask, SL, MVT::i32));
14964 }
14965 }
14966
14967 return SDValue();
14968}
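// Illustrative example (added for exposition): some of the folds performed
// above, in pseudo-DAG form:
//
//   setcc (sign_extend i1:%cc), -1, seteq  --> %cc
//   setcc (sign_extend i1:%cc), 0, seteq   --> xor %cc, -1
//   setcc (fabs %x), +inf, setoeq          --> fp_class %x,
//                                              (P_INFINITY | N_INFINITY)
//
// The fp_class form later selects to a v_cmp_class instruction.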
14969
14970SDValue
14971SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14972 DAGCombinerInfo &DCI) const {
14973 SelectionDAG &DAG = DCI.DAG;
14974 SDLoc SL(N);
14975 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14976
14977 SDValue Src = N->getOperand(0);
14978 SDValue Shift = N->getOperand(0);
14979
14980 // TODO: Extend type shouldn't matter (assuming legal types).
14981 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14982 Shift = Shift.getOperand(0);
14983
14984 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14985 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14986 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14987 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14988 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14989 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14990 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14991 SDValue Shifted = DAG.getZExtOrTrunc(
14992 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14993
14994 unsigned ShiftOffset = 8 * Offset;
14995 if (Shift.getOpcode() == ISD::SHL)
14996 ShiftOffset -= C->getZExtValue();
14997 else
14998 ShiftOffset += C->getZExtValue();
14999
15000 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15001 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15002 MVT::f32, Shifted);
15003 }
15004 }
15005 }
15006
15007 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15008 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15009 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15010 // We simplified Src. If this node is not dead, visit it again so it is
15011 // folded properly.
15012 if (N->getOpcode() != ISD::DELETED_NODE)
15013 DCI.AddToWorklist(N);
15014 return SDValue(N, 0);
15015 }
15016
15017 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15018 if (SDValue DemandedSrc =
15019 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15020 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15021
15022 return SDValue();
15023}
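// Illustrative example (added for exposition): the byte index of the
// conversion absorbs a constant shift of the source when the selected byte
// still lies within bits [31:0] and the shift amount is a multiple of 8:
//
//   cvt_f32_ubyte0 (srl %x, 8)  --> cvt_f32_ubyte1 %x
//   cvt_f32_ubyte1 (shl %x, 8)  --> cvt_f32_ubyte0 %x
//
// Otherwise only the demanded byte of the source is kept live via
// SimplifyDemandedBits.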
15024
15025SDValue SITargetLowering::performClampCombine(SDNode *N,
15026 DAGCombinerInfo &DCI) const {
15027 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15028 if (!CSrc)
15029 return SDValue();
15030
15031 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15032 const APFloat &F = CSrc->getValueAPF();
15033 APFloat Zero = APFloat::getZero(F.getSemantics());
15034 if (F < Zero ||
15035 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15036 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15037 }
15038
15039 APFloat One(F.getSemantics(), "1.0");
15040 if (F > One)
15041 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15042
15043 return SDValue(CSrc, 0);
15044}
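// Illustrative example (added for exposition): with a constant source the
// clamp folds to a constant, e.g. clamp 2.5 --> 1.0 and clamp -0.5 --> 0.0.
// A NaN source folds to 0.0 only when the function runs in DX10-clamp mode;
// otherwise the NaN constant is returned unchanged.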
15045
15046 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15047 DAGCombinerInfo &DCI) const {
15048 switch (N->getOpcode()) {
15049 case ISD::ADD:
15050 case ISD::SUB:
15051 case ISD::SHL:
15052 case ISD::SRL:
15053 case ISD::SRA:
15054 case ISD::AND:
15055 case ISD::OR:
15056 case ISD::XOR:
15057 case ISD::MUL:
15058 case ISD::SETCC:
15059 case ISD::SELECT:
15060 case ISD::SMIN:
15061 case ISD::SMAX:
15062 case ISD::UMIN:
15063 case ISD::UMAX:
15064 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15065 return Res;
15066 break;
15067 default:
15068 break;
15069 }
15070
15071 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15072 return SDValue();
15073
15074 switch (N->getOpcode()) {
15075 case ISD::ADD:
15076 return performAddCombine(N, DCI);
15077 case ISD::SUB:
15078 return performSubCombine(N, DCI);
15079 case ISD::UADDO_CARRY:
15080 case ISD::USUBO_CARRY:
15081 return performAddCarrySubCarryCombine(N, DCI);
15082 case ISD::FADD:
15083 return performFAddCombine(N, DCI);
15084 case ISD::FSUB:
15085 return performFSubCombine(N, DCI);
15086 case ISD::FDIV:
15087 return performFDivCombine(N, DCI);
15088 case ISD::FMUL:
15089 return performFMulCombine(N, DCI);
15090 case ISD::SETCC:
15091 return performSetCCCombine(N, DCI);
15092 case ISD::FMAXNUM:
15093 case ISD::FMINNUM:
15094 case ISD::FMAXNUM_IEEE:
15095 case ISD::FMINNUM_IEEE:
15096 case ISD::FMAXIMUM:
15097 case ISD::FMINIMUM:
15098 case ISD::SMAX:
15099 case ISD::SMIN:
15100 case ISD::UMAX:
15101 case ISD::UMIN:
15102 case AMDGPUISD::FMIN_LEGACY:
15103 case AMDGPUISD::FMAX_LEGACY:
15104 return performMinMaxCombine(N, DCI);
15105 case ISD::FMA:
15106 return performFMACombine(N, DCI);
15107 case ISD::AND:
15108 return performAndCombine(N, DCI);
15109 case ISD::OR:
15110 return performOrCombine(N, DCI);
15111 case ISD::FSHR: {
15112 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15113 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15114 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15115 return matchPERM(N, DCI);
15116 }
15117 break;
15118 }
15119 case ISD::XOR:
15120 return performXorCombine(N, DCI);
15121 case ISD::ZERO_EXTEND:
15122 return performZeroExtendCombine(N, DCI);
15123 case ISD::SIGN_EXTEND_INREG:
15124 return performSignExtendInRegCombine(N, DCI);
15125 case AMDGPUISD::FP_CLASS:
15126 return performClassCombine(N, DCI);
15127 case ISD::FCANONICALIZE:
15128 return performFCanonicalizeCombine(N, DCI);
15129 case AMDGPUISD::RCP:
15130 return performRcpCombine(N, DCI);
15131 case ISD::FLDEXP:
15132 case AMDGPUISD::FRACT:
15133 case AMDGPUISD::RSQ:
15134 case AMDGPUISD::RCP_LEGACY:
15135 case AMDGPUISD::RCP_IFLAG:
15136 case AMDGPUISD::RSQ_CLAMP: {
15137 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15138 SDValue Src = N->getOperand(0);
15139 if (Src.isUndef())
15140 return Src;
15141 break;
15142 }
15143 case ISD::SINT_TO_FP:
15144 case ISD::UINT_TO_FP:
15145 return performUCharToFloatCombine(N, DCI);
15146 case ISD::FCOPYSIGN:
15147 return performFCopySignCombine(N, DCI);
15148 case AMDGPUISD::CVT_F32_UBYTE0:
15149 case AMDGPUISD::CVT_F32_UBYTE1:
15150 case AMDGPUISD::CVT_F32_UBYTE2:
15151 case AMDGPUISD::CVT_F32_UBYTE3:
15152 return performCvtF32UByteNCombine(N, DCI);
15153 case AMDGPUISD::FMED3:
15154 return performFMed3Combine(N, DCI);
15155 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15156 return performCvtPkRTZCombine(N, DCI);
15157 case AMDGPUISD::CLAMP:
15158 return performClampCombine(N, DCI);
15159 case ISD::SCALAR_TO_VECTOR: {
15160 SelectionDAG &DAG = DCI.DAG;
15161 EVT VT = N->getValueType(0);
15162
15163 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15164 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15165 SDLoc SL(N);
15166 SDValue Src = N->getOperand(0);
15167 EVT EltVT = Src.getValueType();
15168 if (EltVT != MVT::i16)
15169 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15170
15171 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15172 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15173 }
15174
15175 break;
15176 }
15177 case ISD::EXTRACT_VECTOR_ELT:
15178 return performExtractVectorEltCombine(N, DCI);
15179 case ISD::INSERT_VECTOR_ELT:
15180 return performInsertVectorEltCombine(N, DCI);
15181 case ISD::FP_ROUND:
15182 return performFPRoundCombine(N, DCI);
15183 case ISD::LOAD: {
15184 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15185 return Widened;
15186 [[fallthrough]];
15187 }
15188 default: {
15189 if (!DCI.isBeforeLegalize()) {
15190 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15191 return performMemSDNodeCombine(MemNode, DCI);
15192 }
15193
15194 break;
15195 }
15196 }
15197
15198 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15199}
15200
15201/// Helper function for adjustWritemask
15202static unsigned SubIdx2Lane(unsigned Idx) {
15203 switch (Idx) {
15204 default:
15205 return ~0u;
15206 case AMDGPU::sub0:
15207 return 0;
15208 case AMDGPU::sub1:
15209 return 1;
15210 case AMDGPU::sub2:
15211 return 2;
15212 case AMDGPU::sub3:
15213 return 3;
15214 case AMDGPU::sub4:
15215 return 4; // Possible with TFE/LWE
15216 }
15217}
15218
15219/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15220SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15221 SelectionDAG &DAG) const {
15222 unsigned Opcode = Node->getMachineOpcode();
15223
15224 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15225 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15226 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15227 return Node; // not implemented for D16
15228
15229 SDNode *Users[5] = {nullptr};
15230 unsigned Lane = 0;
15231 unsigned DmaskIdx =
15232 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15233 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15234 unsigned NewDmask = 0;
15235 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15236 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15237 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15238 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15239 ? true
15240 : false;
15241 unsigned TFCLane = 0;
15242 bool HasChain = Node->getNumValues() > 1;
15243
15244 if (OldDmask == 0) {
15245 // These are folded out, but on the chance it happens don't assert.
15246 return Node;
15247 }
15248
15249 unsigned OldBitsSet = llvm::popcount(OldDmask);
15250 // Work out which is the TFE/LWE lane if that is enabled.
15251 if (UsesTFC) {
15252 TFCLane = OldBitsSet;
15253 }
15254
15255 // Try to figure out the used register components
15256 for (SDUse &Use : Node->uses()) {
15257
15258 // Don't look at users of the chain.
15259 if (Use.getResNo() != 0)
15260 continue;
15261
15262 SDNode *User = Use.getUser();
15263
15264 // Abort if we can't understand the usage
15265 if (!User->isMachineOpcode() ||
15266 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15267 return Node;
15268
15269 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15270 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15271 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15272 // set, etc.
15273 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15274 if (Lane == ~0u)
15275 return Node;
15276
15277 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15278 if (UsesTFC && Lane == TFCLane) {
15279 Users[Lane] = User;
15280 } else {
15281 // Set which texture component corresponds to the lane.
15282 unsigned Comp;
15283 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15284 Comp = llvm::countr_zero(Dmask);
15285 Dmask &= ~(1 << Comp);
15286 }
15287
15288 // Abort if we have more than one user per component.
15289 if (Users[Lane])
15290 return Node;
15291
15292 Users[Lane] = User;
15293 NewDmask |= 1 << Comp;
15294 }
15295 }
15296
15297 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15298 bool NoChannels = !NewDmask;
15299 if (NoChannels) {
15300 if (!UsesTFC) {
15301 // No uses of the result and not using TFC. Then do nothing.
15302 return Node;
15303 }
15304 // If the original dmask has one channel - then nothing to do
15305 if (OldBitsSet == 1)
15306 return Node;
15307 // Use an arbitrary dmask - required for the instruction to work
15308 NewDmask = 1;
15309 }
15310 // Abort if there's no change
15311 if (NewDmask == OldDmask)
15312 return Node;
15313
15314 unsigned BitsSet = llvm::popcount(NewDmask);
15315
15316 // Check for TFE or LWE - increase the number of channels by one to account
15317 // for the extra return value
15318 // This will need adjustment for D16 if this is also included in
15319 // adjustWriteMask (this function) but at present D16 are excluded.
15320 unsigned NewChannels = BitsSet + UsesTFC;
15321
15322 int NewOpcode =
15323 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15324 assert(NewOpcode != -1 &&
15325 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15326 "failed to find equivalent MIMG op");
15327
15328 // Adjust the writemask in the node
15329 SmallVector<SDValue, 12> Ops;
15330 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15331 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15332 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15333
15334 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15335
15336 MVT ResultVT = NewChannels == 1
15337 ? SVT
15338 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15339 : NewChannels == 5 ? 8
15340 : NewChannels);
15341 SDVTList NewVTList =
15342 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15343
15344 MachineSDNode *NewNode =
15345 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15346
15347 if (HasChain) {
15348 // Update chain.
15349 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15350 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15351 }
15352
15353 if (NewChannels == 1) {
15354 assert(Node->hasNUsesOfValue(1, 0));
15355 SDNode *Copy =
15356 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15357 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15358 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15359 return nullptr;
15360 }
15361
15362 // Update the users of the node with the new indices
15363 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15364 SDNode *User = Users[i];
15365 if (!User) {
15366 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15367 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15368 if (i || !NoChannels)
15369 continue;
15370 } else {
15371 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15372 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15373 if (NewUser != User) {
15374 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15375 DAG.RemoveDeadNode(User);
15376 }
15377 }
15378
15379 switch (Idx) {
15380 default:
15381 break;
15382 case AMDGPU::sub0:
15383 Idx = AMDGPU::sub1;
15384 break;
15385 case AMDGPU::sub1:
15386 Idx = AMDGPU::sub2;
15387 break;
15388 case AMDGPU::sub2:
15389 Idx = AMDGPU::sub3;
15390 break;
15391 case AMDGPU::sub3:
15392 Idx = AMDGPU::sub4;
15393 break;
15394 }
15395 }
15396
15397 DAG.RemoveDeadNode(Node);
15398 return nullptr;
15399}
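// Illustrative example (added for exposition): a sketch of the
// transformation, assuming only the X and Z components of a four-channel
// sample are ever extracted:
//
//   %v:vreg_128 = IMAGE_SAMPLE ... dmask:0xf
//   %x = EXTRACT_SUBREG %v, sub0
//   %z = EXTRACT_SUBREG %v, sub2
// becomes
//   %v:vreg_64 = IMAGE_SAMPLE ... dmask:0x5
//   %x = EXTRACT_SUBREG %v, sub0
//   %z = EXTRACT_SUBREG %v, sub1
//
// Enabled channels are packed, so the extract indices are renumbered.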
15400
15401 static bool isFrameIndexOp(SDValue Op) {
15402 if (Op.getOpcode() == ISD::AssertZext)
15403 Op = Op.getOperand(0);
15404
15405 return isa<FrameIndexSDNode>(Op);
15406}
15407
15408/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15409/// with frame index operands.
15410 /// LLVM assumes that inputs to these instructions are registers.
15411 SDNode *
15412 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15413 SelectionDAG &DAG) const {
15414 if (Node->getOpcode() == ISD::CopyToReg) {
15415 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15416 SDValue SrcVal = Node->getOperand(2);
15417
15418 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15419 // to try understanding copies to physical registers.
15420 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15421 SDLoc SL(Node);
15422 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15423 SDValue VReg = DAG.getRegister(
15424 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15425
15426 SDNode *Glued = Node->getGluedNode();
15427 SDValue ToVReg = DAG.getCopyToReg(
15428 Node->getOperand(0), SL, VReg, SrcVal,
15429 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15430 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15431 VReg, ToVReg.getValue(1));
15432 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15433 DAG.RemoveDeadNode(Node);
15434 return ToResultReg.getNode();
15435 }
15436 }
15437
15439 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15440 if (!isFrameIndexOp(Node->getOperand(i))) {
15441 Ops.push_back(Node->getOperand(i));
15442 continue;
15443 }
15444
15445 SDLoc DL(Node);
15446 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15447 Node->getOperand(i).getValueType(),
15448 Node->getOperand(i)),
15449 0));
15450 }
15451
15452 return DAG.UpdateNodeOperands(Node, Ops);
15453}
15454
15455/// Fold the instructions after selecting them.
15456 /// Returns null if users were already updated.
15457 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15458 SelectionDAG &DAG) const {
15459 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15460 unsigned Opcode = Node->getMachineOpcode();
15461
15462 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15463 !TII->isGather4(Opcode) &&
15464 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15465 return adjustWritemask(Node, DAG);
15466 }
15467
15468 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15469 legalizeTargetIndependentNode(Node, DAG);
15470 return Node;
15471 }
15472
15473 switch (Opcode) {
15474 case AMDGPU::V_DIV_SCALE_F32_e64:
15475 case AMDGPU::V_DIV_SCALE_F64_e64: {
15476 // Satisfy the operand register constraint when one of the inputs is
15477 // undefined. Ordinarily each undef value will have its own implicit_def of
15478 // a vreg, so force these to use a single register.
15479 SDValue Src0 = Node->getOperand(1);
15480 SDValue Src1 = Node->getOperand(3);
15481 SDValue Src2 = Node->getOperand(5);
15482
15483 if ((Src0.isMachineOpcode() &&
15484 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15485 (Src0 == Src1 || Src0 == Src2))
15486 break;
15487
15488 MVT VT = Src0.getValueType().getSimpleVT();
15489 const TargetRegisterClass *RC =
15490 getRegClassFor(VT, Src0.getNode()->isDivergent());
15491
15492 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15493 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15494
15495 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15496 Src0, SDValue());
15497
15498 // src0 must be the same register as src1 or src2, even if the value is
15499 // undefined, so make sure we don't violate this constraint.
15500 if (Src0.isMachineOpcode() &&
15501 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15502 if (Src1.isMachineOpcode() &&
15503 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15504 Src0 = Src1;
15505 else if (Src2.isMachineOpcode() &&
15506 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15507 Src0 = Src2;
15508 else {
15509 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15510 Src0 = UndefReg;
15511 Src1 = UndefReg;
15512 }
15513 } else
15514 break;
15515
15516 SmallVector<SDValue, 9> Ops(Node->ops());
15517 Ops[1] = Src0;
15518 Ops[3] = Src1;
15519 Ops[5] = Src2;
15520 Ops.push_back(ImpDef.getValue(1));
15521 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15522 }
15523 default:
15524 break;
15525 }
15526
15527 return Node;
15528}
15529
15530// Any MIMG instructions that use tfe or lwe require an initialization of the
15531// result register that will be written in the case of a memory access failure.
15532// The required code is also added to tie this init code to the result of the
15533 // img instruction.
15534 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15535 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15536 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15537 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15538 MachineBasicBlock &MBB = *MI.getParent();
15539
15540 int DstIdx =
15541 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15542 unsigned InitIdx = 0;
15543
15544 if (TII->isImage(MI)) {
15545 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15546 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15547 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15548
15549 if (!TFE && !LWE) // intersect_ray
15550 return;
15551
15552 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15553 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15554 unsigned D16Val = D16 ? D16->getImm() : 0;
15555
15556 if (!TFEVal && !LWEVal)
15557 return;
15558
15559 // At least one of TFE or LWE are non-zero
15560 // We have to insert a suitable initialization of the result value and
15561 // tie this to the dest of the image instruction.
15562
15563 // Calculate which dword we have to initialize to 0.
15564 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15565
15566 // check that dmask operand is found.
15567 assert(MO_Dmask && "Expected dmask operand in instruction");
15568
15569 unsigned dmask = MO_Dmask->getImm();
15570 // Determine the number of active lanes taking into account the
15571 // Gather4 special case
15572 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15573
15574 bool Packed = !Subtarget->hasUnpackedD16VMem();
15575
15576 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15577
15578 // Abandon attempt if the dst size isn't large enough
15579 // - this is in fact an error but this is picked up elsewhere and
15580 // reported correctly.
15581 uint32_t DstSize =
15582 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15583 if (DstSize < InitIdx)
15584 return;
15585 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15586 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15587 } else {
15588 return;
15589 }
15590
15591 const DebugLoc &DL = MI.getDebugLoc();
15592
15593 // Create a register for the initialization value.
15594 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15595 unsigned NewDst = 0; // Final initialized value will be in here
15596
15597 // If PRTStrictNull feature is enabled (the default) then initialize
15598 // all the result registers to 0, otherwise just the error indication
15599 // register (VGPRn+1)
15600 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15601 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15602
15603 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15604 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15605 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15606 // Initialize dword
15607 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15608 // clang-format off
15609 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15610 .addImm(0);
15611 // clang-format on
15612 // Insert into the super-reg
15613 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15614 .addReg(PrevDst)
15615 .addReg(SubReg)
15616 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15617
15618 PrevDst = NewDst;
15619 }
15620
15621 // Add as an implicit operand
15622 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15623
15624 // Tie the just added implicit operand to the dst
15625 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15626}
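// Illustrative example (added for exposition): a rough sketch of the
// initialization emitted above for a TFE/LWE image load with PRTStrictNull
// enabled:
//
//   %init = IMPLICIT_DEF
//   %zero = V_MOV_B32_e32 0
//   %init.subK = INSERT_SUBREG %prev, %zero, subK   ; one per dword
//   IMAGE_LOAD ... tfe:1                            ; vdata tied to %init
//
// so the extra status dword (and, with strict PRT null, every result dword)
// has a defined value even if the access does not write it.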
15627
15628/// Assign the register class depending on the number of
15629/// bits set in the writemask
15630 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15631 SDNode *Node) const {
15632 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15633
15634 MachineFunction *MF = MI.getParent()->getParent();
15635 MachineRegisterInfo &MRI = MF->getRegInfo();
15636 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15637
15638 if (TII->isVOP3(MI.getOpcode())) {
15639 // Make sure constant bus requirements are respected.
15640 TII->legalizeOperandsVOP3(MRI, MI);
15641
15642 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15643 // This saves a chain-copy of registers and better balance register
15644 // use between vgpr and agpr as agpr tuples tend to be big.
15645 if (!MI.getDesc().operands().empty()) {
15646 unsigned Opc = MI.getOpcode();
15647 bool HasAGPRs = Info->mayNeedAGPRs();
15648 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15649 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15650 for (auto I :
15651 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15652 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15653 if (I == -1)
15654 break;
15655 if ((I == Src2Idx) && (HasAGPRs))
15656 break;
15657 MachineOperand &Op = MI.getOperand(I);
15658 if (!Op.isReg() || !Op.getReg().isVirtual())
15659 continue;
15660 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15661 if (!TRI->hasAGPRs(RC))
15662 continue;
15663 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15664 if (!Src || !Src->isCopy() ||
15665 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15666 continue;
15667 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15668 // All uses of agpr64 and agpr32 can also accept vgpr except for
15669 // v_accvgpr_read, but we do not produce agpr reads during selection,
15670 // so no use checks are needed.
15671 MRI.setRegClass(Op.getReg(), NewRC);
15672 }
15673
15674 if (TII->isMAI(MI)) {
15675 // The ordinary src0, src1, src2 were legalized above.
15676 //
15677 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15678 // as a separate instruction.
15679 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15680 AMDGPU::OpName::scale_src0);
15681 if (Src0Idx != -1) {
15682 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15683 AMDGPU::OpName::scale_src1);
15684 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15685 TII->usesConstantBus(MRI, MI, Src1Idx))
15686 TII->legalizeOpWithMove(MI, Src1Idx);
15687 }
15688 }
15689
15690 if (!HasAGPRs)
15691 return;
15692
15693 // Resolve the rest of AV operands to AGPRs.
15694 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15695 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15696 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15697 if (TRI->isVectorSuperClass(RC)) {
15698 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15699 MRI.setRegClass(Src2->getReg(), NewRC);
15700 if (Src2->isTied())
15701 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15702 }
15703 }
15704 }
15705 }
15706
15707 return;
15708 }
15709
15710 if (TII->isImage(MI))
15711 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15712}
15713
15714 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15715 uint64_t Val) {
15716 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15717 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15718}
15719
15720 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15721 const SDLoc &DL,
15722 SDValue Ptr) const {
15723 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15724
15725 // Build the half of the subregister with the constants before building the
15726 // full 128-bit register. If we are building multiple resource descriptors,
15727 // this will allow CSEing of the 2-component register.
15728 const SDValue Ops0[] = {
15729 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15730 buildSMovImm32(DAG, DL, 0),
15731 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15732 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15733 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15734
15735 SDValue SubRegHi = SDValue(
15736 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15737
15738 // Combine the constants and the pointer.
15739 const SDValue Ops1[] = {
15740 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15741 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15742 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15743
15744 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15745}
15746
15747/// Return a resource descriptor with the 'Add TID' bit enabled
15748/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15749/// of the resource descriptor) to create an offset, which is added to
15750 /// the resource pointer.
15751 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15752 SDValue Ptr, uint32_t RsrcDword1,
15753 uint64_t RsrcDword2And3) const {
15754 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15755 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15756 if (RsrcDword1) {
15757 PtrHi =
15758 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15759 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15760 0);
15761 }
15762
15763 SDValue DataLo =
15764 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15765 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15766
15767 const SDValue Ops[] = {
15768 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15769 PtrLo,
15770 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15771 PtrHi,
15772 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15773 DataLo,
15774 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15775 DataHi,
15776 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15777
15778 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15779}
15780
15781//===----------------------------------------------------------------------===//
15782// SI Inline Assembly Support
15783//===----------------------------------------------------------------------===//
15784
15785 std::pair<unsigned, const TargetRegisterClass *>
15786 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15787 StringRef Constraint,
15788 MVT VT) const {
15789 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15790
15791 const TargetRegisterClass *RC = nullptr;
15792 if (Constraint.size() == 1) {
15793 const unsigned BitWidth = VT.getSizeInBits();
15794 switch (Constraint[0]) {
15795 default:
15796 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15797 case 's':
15798 case 'r':
15799 switch (BitWidth) {
15800 case 16:
15801 RC = &AMDGPU::SReg_32RegClass;
15802 break;
15803 case 64:
15804 RC = &AMDGPU::SGPR_64RegClass;
15805 break;
15806 default:
15807 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15808 if (!RC)
15809 return std::pair(0U, nullptr);
15810 break;
15811 }
15812 break;
15813 case 'v':
15814 switch (BitWidth) {
15815 case 16:
15816 RC = &AMDGPU::VGPR_32RegClass;
15817 break;
15818 default:
15819 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15820 if (!RC)
15821 return std::pair(0U, nullptr);
15822 break;
15823 }
15824 break;
15825 case 'a':
15826 if (!Subtarget->hasMAIInsts())
15827 break;
15828 switch (BitWidth) {
15829 case 16:
15830 RC = &AMDGPU::AGPR_32RegClass;
15831 break;
15832 default:
15833 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15834 if (!RC)
15835 return std::pair(0U, nullptr);
15836 break;
15837 }
15838 break;
15839 }
15840 // We actually support i128, i16 and f16 as inline parameters
15841 // even if they are not reported as legal
15842 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15843 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15844 return std::pair(0U, RC);
15845 }
15846
15847 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15848 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15849 if (RegName.consume_front("v")) {
15850 RC = &AMDGPU::VGPR_32RegClass;
15851 } else if (RegName.consume_front("s")) {
15852 RC = &AMDGPU::SGPR_32RegClass;
15853 } else if (RegName.consume_front("a")) {
15854 RC = &AMDGPU::AGPR_32RegClass;
15855 }
15856
15857 if (RC) {
15858 uint32_t Idx;
15859 if (RegName.consume_front("[")) {
15860 uint32_t End;
15861 bool Failed = RegName.consumeInteger(10, Idx);
15862 Failed |= !RegName.consume_front(":");
15863 Failed |= RegName.consumeInteger(10, End);
15864 Failed |= !RegName.consume_back("]");
15865 if (!Failed) {
15866 uint32_t Width = (End - Idx + 1) * 32;
15867 // Prohibit constraints for register ranges with a width that does not
15868 // match the required type.
15869 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15870 return std::pair(0U, nullptr);
15871 MCRegister Reg = RC->getRegister(Idx);
15872 if (SIRegisterInfo::isVGPRClass(RC))
15873 RC = TRI->getVGPRClassForBitWidth(Width);
15874 else if (SIRegisterInfo::isSGPRClass(RC))
15875 RC = TRI->getSGPRClassForBitWidth(Width);
15876 else if (SIRegisterInfo::isAGPRClass(RC))
15877 RC = TRI->getAGPRClassForBitWidth(Width);
15878 if (RC) {
15879 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15880 return std::pair(Reg, RC);
15881 }
15882 }
15883 } else {
15884 // Check for lossy scalar/vector conversions.
15885 if (VT.isVector() && VT.getSizeInBits() != 32)
15886 return std::pair(0U, nullptr);
15887 bool Failed = RegName.getAsInteger(10, Idx);
15888 if (!Failed && Idx < RC->getNumRegs())
15889 return std::pair(RC->getRegister(Idx), RC);
15890 }
15891 }
15892 }
15893
15894 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15895 if (Ret.first)
15896 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15897
15898 return Ret;
15899}
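// Illustrative example (added for exposition): constraints handled above,
// assuming a 32-bit operand type unless stated otherwise:
//
//   "v"        any VGPR            "s"        any SGPR
//   "a"        any AGPR (only when the subtarget has MAI instructions)
//   "{v5}"     the physical VGPR v5
//   "{s[8:9]}" the s[8:9] pair, valid only for a 64-bit operand type
//
// A bracketed register range is rejected when its width does not match the
// operand type.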
15900
15901static bool isImmConstraint(StringRef Constraint) {
15902 if (Constraint.size() == 1) {
15903 switch (Constraint[0]) {
15904 default:
15905 break;
15906 case 'I':
15907 case 'J':
15908 case 'A':
15909 case 'B':
15910 case 'C':
15911 return true;
15912 }
15913 } else if (Constraint == "DA" || Constraint == "DB") {
15914 return true;
15915 }
15916 return false;
15917}
15918
15919 SITargetLowering::ConstraintType
15920 SITargetLowering::getConstraintType(StringRef Constraint) const {
15921 if (Constraint.size() == 1) {
15922 switch (Constraint[0]) {
15923 default:
15924 break;
15925 case 's':
15926 case 'v':
15927 case 'a':
15928 return C_RegisterClass;
15929 }
15930 }
15931 if (isImmConstraint(Constraint)) {
15932 return C_Other;
15933 }
15934 return TargetLowering::getConstraintType(Constraint);
15935}
15936
15937 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15938 if (!AMDGPU::isInlinableIntLiteral(Val)) {
15939 Val = Val & maskTrailingOnes<uint64_t>(Size);
15940 }
15941 return Val;
15942}
15943
15944 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15945 StringRef Constraint,
15946 std::vector<SDValue> &Ops,
15947 SelectionDAG &DAG) const {
15948 if (isImmConstraint(Constraint)) {
15949 uint64_t Val;
15950 if (getAsmOperandConstVal(Op, Val) &&
15951 checkAsmConstraintVal(Op, Constraint, Val)) {
15952 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15953 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15954 }
15955 } else {
15956 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15957 }
15958}
15959
15960 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15961 unsigned Size = Op.getScalarValueSizeInBits();
15962 if (Size > 64)
15963 return false;
15964
15965 if (Size == 16 && !Subtarget->has16BitInsts())
15966 return false;
15967
15968 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15969 Val = C->getSExtValue();
15970 return true;
15971 }
15972 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15973 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15974 return true;
15975 }
15976 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15977 if (Size != 16 || Op.getNumOperands() != 2)
15978 return false;
15979 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15980 return false;
15981 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15982 Val = C->getSExtValue();
15983 return true;
15984 }
15985 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15986 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15987 return true;
15988 }
15989 }
15990
15991 return false;
15992}
15993
15994 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15995 uint64_t Val) const {
15996 if (Constraint.size() == 1) {
15997 switch (Constraint[0]) {
15998 case 'I':
15999 return AMDGPU::isInlinableIntLiteral(Val);
16000 case 'J':
16001 return isInt<16>(Val);
16002 case 'A':
16003 return checkAsmConstraintValA(Op, Val);
16004 case 'B':
16005 return isInt<32>(Val);
16006 case 'C':
16007 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16008 isInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits()));
16009 default:
16010 break;
16011 }
16012 } else if (Constraint.size() == 2) {
16013 if (Constraint == "DA") {
16014 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16015 int64_t LoBits = static_cast<int32_t>(Val);
16016 return checkAsmConstraintValA(Op, HiBits, 32) &&
16017 checkAsmConstraintValA(Op, LoBits, 32);
16018 }
16019 if (Constraint == "DB") {
16020 return true;
16021 }
16022 }
16023 llvm_unreachable("Invalid asm constraint");
16024}
16025
16026 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16027 unsigned MaxSize) const {
16028 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16029 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16030 if (Size == 16) {
16031 MVT VT = Op.getSimpleValueType();
16032 switch (VT.SimpleTy) {
16033 default:
16034 return false;
16035 case MVT::i16:
16036 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16037 case MVT::f16:
16038 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16039 case MVT::bf16:
16040 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16041 case MVT::v2i16:
16042 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16043 case MVT::v2f16:
16044 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16045 case MVT::v2bf16:
16046 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16047 }
16048 }
16049 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16050 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16051 return true;
16052 return false;
16053}
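// Illustrative example (added for exposition): the 'A' constraint accepts
// only values encodable as hardware inline constants for the operand type,
// e.g. for f32 these are +-0.0, +-0.5, +-1.0, +-2.0, +-4.0, small integers
// in the -16..64 range, and 1/(2*pi) on subtargets with the inv2pi inline
// immediate; any other literal must use a different constraint.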
16054
16055static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16056 switch (UnalignedClassID) {
16057 case AMDGPU::VReg_64RegClassID:
16058 return AMDGPU::VReg_64_Align2RegClassID;
16059 case AMDGPU::VReg_96RegClassID:
16060 return AMDGPU::VReg_96_Align2RegClassID;
16061 case AMDGPU::VReg_128RegClassID:
16062 return AMDGPU::VReg_128_Align2RegClassID;
16063 case AMDGPU::VReg_160RegClassID:
16064 return AMDGPU::VReg_160_Align2RegClassID;
16065 case AMDGPU::VReg_192RegClassID:
16066 return AMDGPU::VReg_192_Align2RegClassID;
16067 case AMDGPU::VReg_224RegClassID:
16068 return AMDGPU::VReg_224_Align2RegClassID;
16069 case AMDGPU::VReg_256RegClassID:
16070 return AMDGPU::VReg_256_Align2RegClassID;
16071 case AMDGPU::VReg_288RegClassID:
16072 return AMDGPU::VReg_288_Align2RegClassID;
16073 case AMDGPU::VReg_320RegClassID:
16074 return AMDGPU::VReg_320_Align2RegClassID;
16075 case AMDGPU::VReg_352RegClassID:
16076 return AMDGPU::VReg_352_Align2RegClassID;
16077 case AMDGPU::VReg_384RegClassID:
16078 return AMDGPU::VReg_384_Align2RegClassID;
16079 case AMDGPU::VReg_512RegClassID:
16080 return AMDGPU::VReg_512_Align2RegClassID;
16081 case AMDGPU::VReg_1024RegClassID:
16082 return AMDGPU::VReg_1024_Align2RegClassID;
16083 case AMDGPU::AReg_64RegClassID:
16084 return AMDGPU::AReg_64_Align2RegClassID;
16085 case AMDGPU::AReg_96RegClassID:
16086 return AMDGPU::AReg_96_Align2RegClassID;
16087 case AMDGPU::AReg_128RegClassID:
16088 return AMDGPU::AReg_128_Align2RegClassID;
16089 case AMDGPU::AReg_160RegClassID:
16090 return AMDGPU::AReg_160_Align2RegClassID;
16091 case AMDGPU::AReg_192RegClassID:
16092 return AMDGPU::AReg_192_Align2RegClassID;
16093 case AMDGPU::AReg_256RegClassID:
16094 return AMDGPU::AReg_256_Align2RegClassID;
16095 case AMDGPU::AReg_512RegClassID:
16096 return AMDGPU::AReg_512_Align2RegClassID;
16097 case AMDGPU::AReg_1024RegClassID:
16098 return AMDGPU::AReg_1024_Align2RegClassID;
16099 default:
16100 return -1;
16101 }
16102}
16103
16104// Figure out which registers should be reserved for stack access. Only after
16105// the function is legalized do we know all of the non-spill stack objects or if
16106 // calls are present.
16107 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16108 MachineRegisterInfo &MRI = MF.getRegInfo();
16109 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16110 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16111 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16112 const SIInstrInfo *TII = ST.getInstrInfo();
16113
16114 if (Info->isEntryFunction()) {
16115 // Callable functions have fixed registers used for stack access.
16116 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16117 }
16118
16119 // TODO: Move this logic to getReservedRegs()
16120 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16121 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16122 Register SReg = ST.isWave32()
16123 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16124 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16125 &AMDGPU::SGPR_64RegClass);
16126 Info->setSGPRForEXECCopy(SReg);
16127
16128 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16129 Info->getStackPtrOffsetReg()));
16130 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16131 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16132
16133 // We need to worry about replacing the default register with itself in case
16134 // of MIR testcases missing the MFI.
16135 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16136 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16137
16138 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16139 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16140
16141 Info->limitOccupancy(MF);
16142
16143 if (ST.isWave32() && !MF.empty()) {
16144 for (auto &MBB : MF) {
16145 for (auto &MI : MBB) {
16146 TII->fixImplicitOperands(MI);
16147 }
16148 }
16149 }
16150
16151 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16152 // classes if required. Ideally the register class constraints would differ
16153 // per-subtarget, but there's no easy way to achieve that right now. This is
16154 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16155 // from using them as the register class for legal types.
16156 if (ST.needsAlignedVGPRs()) {
16157 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16158 const Register Reg = Register::index2VirtReg(I);
16159 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16160 if (!RC)
16161 continue;
16162 int NewClassID = getAlignedAGPRClassID(RC->getID());
16163 if (NewClassID != -1)
16164 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16165 }
16166 }
16167
16169}
16170
16171 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16172 KnownBits &Known,
16173 const APInt &DemandedElts,
16174 const SelectionDAG &DAG,
16175 unsigned Depth) const {
16176 Known.resetAll();
16177 unsigned Opc = Op.getOpcode();
16178 switch (Opc) {
16179 case ISD::INTRINSIC_WO_CHAIN: {
16180 unsigned IID = Op.getConstantOperandVal(0);
16181 switch (IID) {
16182 case Intrinsic::amdgcn_mbcnt_lo:
16183 case Intrinsic::amdgcn_mbcnt_hi: {
16184 const GCNSubtarget &ST =
16185 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16186 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16187 // most 31 + src1.
16188 Known.Zero.setBitsFrom(
16189 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16190 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16191 Known = KnownBits::add(Known, Known2);
16192 return;
16193 }
16194 }
16195 break;
16196 }
16197 }
16198 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16199 Op, Known, DemandedElts, DAG, Depth);
16200}
16201
16202 void SITargetLowering::computeKnownBitsForFrameIndex(
16203 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16204 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16205
16206 // Set the high bits to zero based on the maximum allowed scratch size per
16207 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16208 // calculation won't overflow, so assume the sign bit is never set.
16209 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16210}
16211
16212 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16213 KnownBits &Known, unsigned Dim) {
16214 unsigned MaxValue =
16215 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16216 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16217}
16218
16219 void SITargetLowering::computeKnownBitsForTargetInstr(
16220 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16221 const MachineRegisterInfo &MRI, unsigned Depth) const {
16222 const MachineInstr *MI = MRI.getVRegDef(R);
16223 switch (MI->getOpcode()) {
16224 case AMDGPU::G_INTRINSIC:
16225 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16226 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16227 switch (IID) {
16228 case Intrinsic::amdgcn_workitem_id_x:
16229 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16230 break;
16231 case Intrinsic::amdgcn_workitem_id_y:
16232 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16233 break;
16234 case Intrinsic::amdgcn_workitem_id_z:
16235 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16236 break;
16237 case Intrinsic::amdgcn_mbcnt_lo:
16238 case Intrinsic::amdgcn_mbcnt_hi: {
16239 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16240 // most 31 + src1.
16241 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16242 ? getSubtarget()->getWavefrontSizeLog2()
16243 : 5);
16244 KnownBits Known2;
16245 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16246 Depth + 1);
16247 Known = KnownBits::add(Known, Known2);
16248 break;
16249 }
16250 case Intrinsic::amdgcn_groupstaticsize: {
16251 // We can report everything over the maximum size as 0. We can't report
16252 // based on the actual size because we don't know if it's accurate or not
16253 // at any given point.
16254 Known.Zero.setHighBits(
16255 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16256 break;
16257 }
16258 }
16259 break;
16260 }
16261 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16262 Known.Zero.setHighBits(24);
16263 break;
16264 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16265 Known.Zero.setHighBits(16);
16266 break;
16267 case AMDGPU::G_AMDGPU_SMED3:
16268 case AMDGPU::G_AMDGPU_UMED3: {
16269 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16270
16271 KnownBits Known2;
16272 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16273 if (Known2.isUnknown())
16274 break;
16275
16276 KnownBits Known1;
16277 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16278 if (Known1.isUnknown())
16279 break;
16280
16281 KnownBits Known0;
16282 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16283 if (Known0.isUnknown())
16284 break;
16285
16286 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16287 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16288 Known.One = Known0.One & Known1.One & Known2.One;
16289 break;
16290 }
16291 }
16292}
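// Illustrative example (added for exposition): on a wave32 subtarget
//   %id = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.lo), -1, 0
// gets Known.Zero set for bits 5 and up (the result is at most 31 plus the
// known bits of the second source), and a workitem id is bounded by the
// maximum workgroup dimension reported by the subtarget.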
16293
16294 Align SITargetLowering::computeKnownAlignForTargetInstr(
16295 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16296 unsigned Depth) const {
16297 const MachineInstr *MI = MRI.getVRegDef(R);
16298 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16299 // FIXME: Can this move to generic code? What about the case where the call
16300 // site specifies a lower alignment?
16301 Intrinsic::ID IID = GI->getIntrinsicID();
16302 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16303 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16304 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16305 return *RetAlign;
16306 }
16307 return Align(1);
16308}
16309
16310 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16311 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16312 const Align CacheLineAlign = Align(64);
16313
16314 // Pre-GFX10 targets did not benefit from loop alignment
16315 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16316 getSubtarget()->hasInstFwdPrefetchBug())
16317 return PrefAlign;
16318
16319 // On GFX10 I$ is 4 x 64 bytes cache lines.
16320 // By default prefetcher keeps one cache line behind and reads two ahead.
16321 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16322 // behind and one ahead.
16323 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
16324 // If the loop fits in 64 bytes it always spans no more than two cache lines and
16325 // does not need an alignment.
16326 // Else, if the loop fits in 128 bytes, we do not need to modify the prefetch mode.
16327 // Else, if the loop fits in 192 bytes, we need two lines behind.
16328
16329 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16330 const MachineBasicBlock *Header = ML->getHeader();
16331 if (Header->getAlignment() != PrefAlign)
16332 return Header->getAlignment(); // Already processed.
16333
16334 unsigned LoopSize = 0;
16335 for (const MachineBasicBlock *MBB : ML->blocks()) {
16336 // If an inner loop block is aligned, assume on average half of the alignment
16337 // size is added as nops.
16338 if (MBB != Header)
16339 LoopSize += MBB->getAlignment().value() / 2;
16340
16341 for (const MachineInstr &MI : *MBB) {
16342 LoopSize += TII->getInstSizeInBytes(MI);
16343 if (LoopSize > 192)
16344 return PrefAlign;
16345 }
16346 }
16347
16348 if (LoopSize <= 64)
16349 return PrefAlign;
16350
16351 if (LoopSize <= 128)
16352 return CacheLineAlign;
16353
16354 // If any of parent loops is surrounded by prefetch instructions do not
16355 // insert new for inner loop, which would reset parent's settings.
16356 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16357 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16358 auto I = Exit->getFirstNonDebugInstr();
16359 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16360 return CacheLineAlign;
16361 }
16362 }
16363
16364 MachineBasicBlock *Pre = ML->getLoopPreheader();
16365 MachineBasicBlock *Exit = ML->getExitBlock();
16366
16367 if (Pre && Exit) {
16368 auto PreTerm = Pre->getFirstTerminator();
16369 if (PreTerm == Pre->begin() ||
16370 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16371 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16372 .addImm(1); // prefetch 2 lines behind PC
16373
16374 auto ExitHead = Exit->getFirstNonDebugInstr();
16375 if (ExitHead == Exit->end() ||
16376 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16377 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16378 .addImm(2); // prefetch 1 line behind PC
16379 }
16380
16381 return CacheLineAlign;
16382}
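// Illustrative example (added for exposition): the resulting policy on
// subtargets with instruction prefetch, by estimated loop size:
//   <= 64 bytes   keep the default preferred alignment
//   <= 128 bytes  align the loop header to a 64-byte cache line
//   <= 192 bytes  align to the cache line and bracket the loop with
//                 S_INST_PREFETCH imm=1 (preheader) / imm=2 (exit)
//   >  192 bytes  keep the default preferred alignment
// Loops nested inside an already-bracketed loop are only aligned, so the
// parent's prefetch setting is not reset.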
16383
16384 LLVM_ATTRIBUTE_UNUSED
16385 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16386 assert(N->getOpcode() == ISD::CopyFromReg);
16387 do {
16388 // Follow the chain until we find an INLINEASM node.
16389 N = N->getOperand(0).getNode();
16390 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16391 return true;
16392 } while (N->getOpcode() == ISD::CopyFromReg);
16393 return false;
16394}
16395
16396 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16397 FunctionLoweringInfo *FLI,
16398 UniformityInfo *UA) const {
16399 switch (N->getOpcode()) {
16400 case ISD::CopyFromReg: {
16401 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16402 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16403 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16404 Register Reg = R->getReg();
16405
16406 // FIXME: Why does this need to consider isLiveIn?
16407 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16408 return !TRI->isSGPRReg(MRI, Reg);
16409
16410 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16411 return UA->isDivergent(V);
16412
16413 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16414 return !TRI->isSGPRReg(MRI, Reg);
16415 }
16416 case ISD::LOAD: {
16417 const LoadSDNode *L = cast<LoadSDNode>(N);
16418 unsigned AS = L->getAddressSpace();
16419 // A flat load may access private memory.
16420 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16421 }
16422 case ISD::CALLSEQ_END:
16423 return true;
16424 case ISD::INTRINSIC_WO_CHAIN:
16425 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16426 case ISD::INTRINSIC_W_CHAIN:
16427 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16446 // Target-specific read-modify-write atomics are sources of divergence.
16447 return true;
16448 default:
16449 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16450 // Generic read-modify-write atomics are sources of divergence.
16451 return A->readMem() && A->writeMem();
16452 }
16453 return false;
16454 }
16455}
16456
16457 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16458 EVT VT) const {
16459 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16460 case MVT::f32:
16461 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16462 case MVT::f64:
16463 case MVT::f16:
16464 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16465 default:
16466 return false;
16467 }
16468}
16469
16470 bool SITargetLowering::denormalsEnabledForType(
16471 LLT Ty, const MachineFunction &MF) const {
16472 switch (Ty.getScalarSizeInBits()) {
16473 case 32:
16474 return !denormalModeIsFlushAllF32(MF);
16475 case 64:
16476 case 16:
16477 return !denormalModeIsFlushAllF64F16(MF);
16478 default:
16479 return false;
16480 }
16481}
16482
16483 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16484 const SelectionDAG &DAG,
16485 bool SNaN,
16486 unsigned Depth) const {
16487 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16488 const MachineFunction &MF = DAG.getMachineFunction();
16489 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16490
16491 if (Info->getMode().DX10Clamp)
16492 return true; // Clamped to 0.
16493 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16494 }
16495
16496 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16497 Depth);
16498}
16499
16500// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16501 // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16502 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16503 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16504 return true;
16505
16506 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16507 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16508 if (DenormMode == DenormalMode::getPreserveSign())
16509 return true;
16510
16511 // TODO: Remove this.
16512 return RMW->getFunction()
16513 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16514 .getValueAsBool();
16515}
16516
16517 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16518 LLVMContext &Ctx = RMW->getContext();
16519 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16520 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16521
16522 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16523 << "Hardware instruction generated for atomic "
16524 << RMW->getOperationName(RMW->getOperation())
16525 << " operation at memory scope " << MemScope;
16526}
16527
16528static bool isV2F16OrV2BF16(Type *Ty) {
16529 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16530 Type *EltTy = VT->getElementType();
16531 return VT->getNumElements() == 2 &&
16532 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16533 }
16534
16535 return false;
16536}
16537
16538static bool isV2F16(Type *Ty) {
16539 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16540 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16541}
16542
16543static bool isV2BF16(Type *Ty) {
16544 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16545 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16546}
16547
16548/// \return true if atomicrmw integer ops work for the type.
16549static bool isAtomicRMWLegalIntTy(Type *Ty) {
16550 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16551 unsigned BW = IT->getBitWidth();
16552 return BW == 32 || BW == 64;
16553 }
16554
16555 return false;
16556}
16557
16558/// \return true if this atomicrmw xchg type can be selected.
16559static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16560 Type *Ty = RMW->getType();
16561 if (isAtomicRMWLegalIntTy(Ty))
16562 return true;
16563
16564 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16565 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16566 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16567 return BW == 32 || BW == 64;
16568 }
16569
16570 if (Ty->isFloatTy() || Ty->isDoubleTy())
16571 return true;
16572
16573 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16574 return VT->getNumElements() == 2 &&
16575 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16576 }
16577
16578 return false;
16579}
16580
16581/// \returns true if it's valid to emit a native instruction for \p RMW, based
16582/// on the properties of the target memory.
16583static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16584 const AtomicRMWInst *RMW,
16585 bool HasSystemScope) {
16586 // The remote/fine-grained access logic is different from the integer
16587 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16588 // fine-grained access does not work, even for a device local allocation.
16589 //
16590 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16591 // allocations work.
16592 if (HasSystemScope) {
16593 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16594 RMW->hasMetadata("amdgpu.no.remote.memory"))
16595 return true;
16596 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16597 return true;
16598
16599 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16600}
16601
16602 /// \return Action to perform on AtomicRMWInsts for integer operations.
16603 static TargetLowering::AtomicExpansionKind
16604 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16605 return isAtomicRMWLegalIntTy(RMW->getType())
16606 ? TargetLowering::AtomicExpansionKind::None
16607 : TargetLowering::AtomicExpansionKind::CmpXChg;
16608}
16609
16610 /// Return if a flat address space atomicrmw can access private memory.
16611 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16612 const MDNode *NoaliasAddrSpaceMD =
16613 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16614 if (!NoaliasAddrSpaceMD)
16615 return true;
16616
16617 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16618 ++I) {
16619 auto *Low = mdconst::extract<ConstantInt>(
16620 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16621 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16622 auto *High = mdconst::extract<ConstantInt>(
16623 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16624 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16625 }
16626 }
16627
16628 return true;
16629}
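// Illustrative example (added for exposition): a frontend can rule out
// private aliasing for a flat atomic with !noalias.addrspace metadata, e.g.
//
//   %old = atomicrmw add ptr %p, i64 1 seq_cst, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}   ; cannot point into address space 5 (private)
//
// in which case the 64-bit flat atomic handled below does not need to be
// expanded to guard against a private access.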
16630
16631 TargetLowering::AtomicExpansionKind
16632 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16633 unsigned AS = RMW->getPointerAddressSpace();
16634 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16635 return AtomicExpansionKind::NotAtomic;
16636
16637 // 64-bit flat atomics that dynamically reside in private memory will silently
16638 // be dropped.
16639 //
16640 // Note that we will emit a new copy of the original atomic in the expansion,
16641 // which will be incrementally relegalized.
16642 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16643 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16644 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16645 flatInstrMayAccessPrivate(RMW))
16646 return AtomicExpansionKind::Expand;
16647
16648 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16649 OptimizationRemarkEmitter ORE(RMW->getFunction());
16650 ORE.emit([=]() {
16651 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16652 });
16653 return Kind;
16654 };
16655
16656 auto SSID = RMW->getSyncScopeID();
16657 bool HasSystemScope =
16658 SSID == SyncScope::System ||
16659 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16660
16661 auto Op = RMW->getOperation();
16662 switch (Op) {
16663 case AtomicRMWInst::Xchg: {
16664 // PCIe supports add and xchg for system atomics.
16665 return isAtomicRMWLegalXChgTy(RMW)
16666 ? TargetLowering::AtomicExpansionKind::None
16667 : TargetLowering::AtomicExpansionKind::CmpXChg;
16668 }
16669 case AtomicRMWInst::Add:
16670 case AtomicRMWInst::And:
16671 case AtomicRMWInst::UIncWrap:
16672 case AtomicRMWInst::UDecWrap:
16673 return atomicSupportedIfLegalIntType(RMW);
16674 case AtomicRMWInst::Sub:
16675 case AtomicRMWInst::Or:
16676 case AtomicRMWInst::Xor: {
16677 // Atomic sub/or/xor do not work over PCI express, but atomic add
16678 // does. InstCombine transforms these with 0 to or, so undo that.
16679 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16680 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16681 ConstVal && ConstVal->isNullValue())
16682 return AtomicExpansionKind::Expand;
16683 }
16684
16685 return atomicSupportedIfLegalIntType(RMW);
16686 }
16687 case AtomicRMWInst::FAdd: {
16688 Type *Ty = RMW->getType();
16689
16690 // TODO: Handle REGION_ADDRESS
16691 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16692 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16693 // is fixed to round-to-nearest-even.
16694 //
16695 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16696 // round-to-nearest-even.
16697 //
16698 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16699 // suggests it is OK if the floating-point mode may not match that of the
16700 // calling thread.
16701 if (Ty->isFloatTy()) {
16702 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16703 : AtomicExpansionKind::CmpXChg;
16704 }
16705
16706 if (Ty->isDoubleTy()) {
16707 // Ignores denormal mode, but we don't consider flushing mandatory.
16708 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16709 : AtomicExpansionKind::CmpXChg;
16710 }
16711
16712 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16713 return AtomicExpansionKind::None;
16714
16715 return AtomicExpansionKind::CmpXChg;
16716 }
16717
16718 // LDS atomics respect the denormal mode from the mode register.
16719 //
16720 // Traditionally f32 global/buffer memory atomics would unconditionally
16721 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16722 // flush.
16723 //
16724 // On targets with flat atomic fadd, denormals would flush depending on
16725 // whether the target address resides in LDS or global memory. We consider
16726 // this flat-maybe-flush as will-flush.
16727 if (Ty->isFloatTy() &&
16728 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16729 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16730 return AtomicExpansionKind::CmpXChg;
16731
16732 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16733 // safe. The message phrasing also should be better.
16734 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16735 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16736 // gfx940, gfx12
16737 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16738 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16739 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16740 // gfx90a, gfx940, gfx12
16741 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16742 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16743
16744 // gfx940, gfx12
16745 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16746 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16747 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16748 // gfx90a, gfx940, gfx12
16749 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16750 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16751
16752 // While gfx90a/gfx940 support v2bf16 for global/flat, they do not for
16753 // buffer. gfx12 does have the buffer version.
16754 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16755 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16756 }
16757
16758 // global and flat atomic fadd f64: gfx90a, gfx940.
16759 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16760 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16761
16762 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16763 if (Ty->isFloatTy()) {
16764 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16765 // gfx11+.
16766 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16767 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16768 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16769 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16770 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16771 } else {
16772 // gfx908
16773 if (RMW->use_empty() &&
16774 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16775 isV2F16(Ty))
16775 isV2F16(Ty))
16776 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16777 }
16778 }
16779
16780 // flat atomic fadd f32: gfx940, gfx11+.
16781 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16782 if (Subtarget->hasFlatAtomicFaddF32Inst())
16783 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16784
16785 // If the atomic is in the flat address space and the type is float, we
16786 // try to expand it when the target supports both global and LDS atomic
16787 // fadd. The reason is that the expansion emits a runtime address-space
16788 // check: if the address is in the global address space we emit a global
16789 // atomic fadd, and if it is in the shared address space we emit an LDS
16790 // atomic fadd.
16791 if (Subtarget->hasLDSFPAtomicAddF32()) {
16792 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16793 return AtomicExpansionKind::Expand;
16794 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16795 return AtomicExpansionKind::Expand;
16796 }
16797 }
16798 }
16799
16800 return AtomicExpansionKind::CmpXChg;
16801 }
16802 case AtomicRMWInst::FMin:
16803 case AtomicRMWInst::FMax: {
16804 Type *Ty = RMW->getType();
16805
16806 // LDS float and double fmin/fmax were always supported.
16807 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16808 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16809 : AtomicExpansionKind::CmpXChg;
16810 }
16811
16812 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16813 // For flat and global cases:
16814 // float, double in gfx7. Manual claims denormal support.
16815 // Removed in gfx8.
16816 // float, double restored in gfx10.
16817 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16818 //
16819 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16820 // no f32.
16821 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16822 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16823 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16824 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16825 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16826 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16827 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16828 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16829 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16830 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16831 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16832 }
16833 }
16834
16835 return AtomicExpansionKind::CmpXChg;
16836 }
16837 case AtomicRMWInst::Min:
16838 case AtomicRMWInst::Max:
16839 case AtomicRMWInst::UMin:
16840 case AtomicRMWInst::UMax: {
16841 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16842 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16843 // Always expand system scope min/max atomics.
16844 if (HasSystemScope)
16845 return AtomicExpansionKind::CmpXChg;
16846 }
16847
16848 return atomicSupportedIfLegalIntType(RMW);
16849 }
16850 case AtomicRMWInst::Nand:
16851 case AtomicRMWInst::FSub:
16852 default:
16853 return AtomicExpansionKind::CmpXChg;
16854 }
16855
16856 llvm_unreachable("covered atomicrmw op switch");
16857}
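// --- Illustrative sketch (not part of the original file) --------------------
// The switch above is driven by the operation, the value type, the address
// space, the sync scope, and the amdgpu.* metadata kinds. A hypothetical
// snippet that builds an fadd atomicrmw exercising the FAdd path could look
// like this; the function and variable names are assumptions for illustration.
static AtomicRMWInst *exampleBuildGlobalFAdd(IRBuilder<> &Builder, Value *Ptr,
                                             Value *FVal) {
  // "one-as" counts as system scope here; tagging the instruction with
  // amdgpu.no.fine.grained.memory lets globalMemoryFPAtomicIsLegal() return
  // true, after which the subtarget feature checks decide the final answer.
  SyncScope::ID OneAs = Builder.getContext().getOrInsertSyncScopeID("one-as");
  AtomicRMWInst *RMW =
      Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, FVal, MaybeAlign(),
                              AtomicOrdering::Monotonic, OneAs);
  RMW->setMetadata("amdgpu.no.fine.grained.memory",
                   MDNode::get(Builder.getContext(), {}));
  return RMW;
}
// --- end sketch --------------------------------------------------------------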
16858
16859TargetLowering::AtomicExpansionKind
16860SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16861 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16862 ? AtomicExpansionKind::NotAtomic
16863 : AtomicExpansionKind::None;
16864}
16865
16866TargetLowering::AtomicExpansionKind
16867SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16868 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16869 ? AtomicExpansionKind::NotAtomic
16870 : AtomicExpansionKind::None;
16871}
16872
16873TargetLowering::AtomicExpansionKind
16874SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16875 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16876 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16877 return AtomicExpansionKind::NotAtomic;
16878
16879 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16880 return AtomicExpansionKind::None;
16881
16882 const DataLayout &DL = CmpX->getDataLayout();
16883
16884 Type *ValTy = CmpX->getNewValOperand()->getType();
16885
16886 // If a 64-bit flat atomic may alias private, we need to avoid using the
16887 // atomic in the private case.
16888 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16889 : AtomicExpansionKind::None;
16890}
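// --- Illustrative sketch (not part of the original file) --------------------
// The Expand result above is chosen for a 64-bit cmpxchg on a flat pointer
// that may alias private memory. A hypothetical snippet that creates exactly
// that shape (a flat, i.e. address-space-0, pointer with an i64 payload) is
// shown below; names are illustrative only.
static AtomicCmpXchgInst *exampleBuildFlatCmpXchg64(IRBuilder<> &Builder,
                                                    Value *FlatPtr,
                                                    Value *Expected,
                                                    Value *Desired) {
  // Without !noalias.addrspace metadata excluding the private address space,
  // flatInstrMayAccessPrivate() returns true and this cmpxchg gets expanded.
  return Builder.CreateAtomicCmpXchg(FlatPtr, Expected, Desired, MaybeAlign(),
                                     AtomicOrdering::Monotonic,
                                     AtomicOrdering::Monotonic);
}
// --- end sketch --------------------------------------------------------------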
16891
16892const TargetRegisterClass *
16893SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16894 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16895 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16896 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16897 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16898 : &AMDGPU::SReg_32RegClass;
16899 if (!TRI->isSGPRClass(RC) && !isDivergent)
16900 return TRI->getEquivalentSGPRClass(RC);
16901 if (TRI->isSGPRClass(RC) && isDivergent)
16902 return TRI->getEquivalentVGPRClass(RC);
16903
16904 return RC;
16905}
16906
16907// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16908// uniform values (as produced by the mask results of control flow intrinsics)
16909// used outside of divergent blocks. The phi users need to also be treated as
16910// always uniform.
16911//
16912// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16913static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16914 unsigned WaveSize) {
16915 // FIXME: We assume the mask results of a control flow intrinsic are never
16916 // cast.
16917 // As a compile-time shortcut, exit early if the type cannot be a wave mask.
16918 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16919 if (!IT || IT->getBitWidth() != WaveSize)
16920 return false;
16921
16922 if (!isa<Instruction>(V))
16923 return false;
16924 if (!Visited.insert(V).second)
16925 return false;
16926 bool Result = false;
16927 for (const auto *U : V->users()) {
16928 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16929 if (V == U->getOperand(1)) {
16930 switch (Intrinsic->getIntrinsicID()) {
16931 default:
16932 Result = false;
16933 break;
16934 case Intrinsic::amdgcn_if_break:
16935 case Intrinsic::amdgcn_if:
16936 case Intrinsic::amdgcn_else:
16937 Result = true;
16938 break;
16939 }
16940 }
16941 if (V == U->getOperand(0)) {
16942 switch (Intrinsic->getIntrinsicID()) {
16943 default:
16944 Result = false;
16945 break;
16946 case Intrinsic::amdgcn_end_cf:
16947 case Intrinsic::amdgcn_loop:
16948 Result = true;
16949 break;
16950 }
16951 }
16952 } else {
16953 Result = hasCFUser(U, Visited, WaveSize);
16954 }
16955 if (Result)
16956 break;
16957 }
16958 return Result;
16959}
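// --- Illustrative sketch (not part of the original file) --------------------
// hasCFUser() looks for wave-mask values feeding the control-flow intrinsics:
// operand 1 of amdgcn.if/else/if.break, operand 0 of amdgcn.end.cf/loop.
// Assuming a wave64 target (i64 masks), IR with that shape could be built as
// follows; the helper name and variable names are illustrative only.
static void exampleBuildCFIntrinsicUse(IRBuilder<> &Builder, Value *Cond) {
  Type *MaskTy = Builder.getInt64Ty();
  // Produces: { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
  CallInst *If =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_if, {MaskTy}, {Cond});
  Value *Mask = Builder.CreateExtractValue(If, 1, "mask");
  // The mask is operand 0 of amdgcn.end.cf, which is what hasCFUser matches.
  Builder.CreateIntrinsic(Intrinsic::amdgcn_end_cf, {MaskTy}, {Mask});
}
// --- end sketch --------------------------------------------------------------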
16960
16961bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16962 const Value *V) const {
16963 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16964 if (CI->isInlineAsm()) {
16965 // FIXME: This cannot give a correct answer. This should only trigger in
16966 // the case where inline asm returns mixed SGPR and VGPR results, used
16967 // outside the defining block. We don't have a specific result to
16968 // consider, so this assumes if any value is SGPR, the overall register
16969 // also needs to be SGPR.
16970 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16971 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16972 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16973 for (auto &TC : TargetConstraints) {
16974 if (TC.Type == InlineAsm::isOutput) {
16975 ComputeConstraintToUse(TC, SDValue());
16976 const TargetRegisterClass *RC =
16977 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16978 TC.ConstraintVT)
16979 .second;
16980 if (RC && SIRI->isSGPRClass(RC))
16981 return true;
16982 }
16983 }
16984 }
16985 }
16986 SmallPtrSet<const Value *, 16> Visited;
16987 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16988}
16989
16990bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16991 for (SDUse &Use : N->uses()) {
16992 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
16993 if (getBasePtrIndex(M) == Use.getOperandNo())
16994 return true;
16995 }
16996 }
16997 return false;
16998}
16999
17000bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17001 SDValue N1) const {
17002 if (!N0.hasOneUse())
17003 return false;
17004 // Prefer to keep N0 uniform when possible.
17005 if (N0->isDivergent() || !N1->isDivergent())
17006 return true;
17007 // Check if we have a good chance to form the memory access pattern with the
17008 // base and offset
17009 return (DAG.isBaseWithConstantOffset(N0) &&
17010 hasMemSDNodeUser(*N0->user_begin()));
17011}
17012
17013bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17014 Register N0, Register N1) const {
17015 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17016}
17017
17018MachineMemOperand::Flags
17019SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17020 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17021 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17022 if (I.getMetadata("amdgpu.noclobber"))
17023 Flags |= MONoClobber;
17024 if (I.getMetadata("amdgpu.last.use"))
17025 Flags |= MOLastUse;
17026 return Flags;
17027}
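// --- Illustrative sketch (not part of the original file) --------------------
// getTargetMMOFlags() only checks for the presence of the two metadata kinds,
// so a pass that has proven the loaded memory is not written during the kernel
// (as AMDGPUAnnotateUniformValues does) can communicate that as shown below.
// The helper name is an assumption for illustration only.
static void exampleMarkLoadNoClobber(LoadInst *LI) {
  LLVMContext &Ctx = LI->getContext();
  // Presence-only metadata: an empty node is sufficient.
  LI->setMetadata("amdgpu.noclobber", MDNode::get(Ctx, {}));
}
// --- end sketch --------------------------------------------------------------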
17028
17029bool SITargetLowering::checkForPhysRegDependency(
17030 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17031 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
17032 if (User->getOpcode() != ISD::CopyToReg)
17033 return false;
17034 if (!Def->isMachineOpcode())
17035 return false;
17036 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17037 if (!MDef)
17038 return false;
17039
17040 unsigned ResNo = User->getOperand(Op).getResNo();
17041 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17042 return false;
17043 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17044 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17045 PhysReg = AMDGPU::SCC;
17046 const TargetRegisterClass *RC =
17047 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17048 Cost = RC->getCopyCost();
17049 return true;
17050 }
17051 return false;
17052}
17053
17054/// Check if it is profitable to hoist an instruction in then/else to if.
17055bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
17056 if (!I->hasOneUse())
17057 return true;
17058
17059 Instruction *User = I->user_back();
17060 // TODO: Add more patterns that are not profitable to hoist and
17061 // handle modifiers such as fabs and fneg
17062 switch (I->getOpcode()) {
17063 case Instruction::FMul: {
17064 if (User->getOpcode() != Instruction::FSub &&
17065 User->getOpcode() != Instruction::FAdd)
17066 return true;
17067
17068 const TargetOptions &Options = getTargetMachine().Options;
17069
17070 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17071 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17072 !Options.UnsafeFPMath) ||
17073 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17074 }
17075 default:
17076 return true;
17077 }
17078 return true;
17079}
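// --- Illustrative sketch (not part of the original file) --------------------
// The FMul case above keeps a multiply next to its single fadd/fsub user so
// the pair can still be contracted into an FMA. A hypothetical IR fragment
// with that shape can be built as follows; names are illustrative only.
static Value *exampleBuildContractableMulAdd(IRBuilder<> &Builder, Value *A,
                                             Value *B, Value *C) {
  FastMathFlags FMF;
  FMF.setAllowContract(true);
  Builder.setFastMathFlags(FMF);
  // Both operations carry the 'contract' flag, so hoisting the fmul into a
  // different block would block fmul+fadd -> fma formation.
  Value *Mul = Builder.CreateFMul(A, B, "mul");
  return Builder.CreateFAdd(Mul, C, "mad");
}
// --- end sketch --------------------------------------------------------------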
17080
17081void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17082 Instruction *AI) const {
17083 // Given: atomicrmw fadd ptr %addr, float %val ordering
17084 //
17085 // With this expansion we produce the following code:
17086 // [...]
17087 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17088 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17089 //
17090 // atomicrmw.shared:
17091 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17092 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17093 // float %val ordering
17094 // br label %atomicrmw.phi
17095 //
17096 // atomicrmw.check.private:
17097 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17098 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17099 //
17100 // atomicrmw.private:
17101 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17102 // %loaded.private = load float, ptr addrspace(5) %cast.private
17103 // %val.new = fadd float %loaded.private, %val
17104 // store float %val.new, ptr addrspace(5) %cast.private
17105 // br label %atomicrmw.phi
17106 //
17107 // atomicrmw.global:
17108 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17109 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17110 // float %val ordering
17111 // br label %atomicrmw.phi
17112 //
17113 // atomicrmw.phi:
17114 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17115 // [ %loaded.private, %atomicrmw.private ],
17116 // [ %loaded.global, %atomicrmw.global ]
17117 // br label %atomicrmw.end
17118 //
17119 // atomicrmw.end:
17120 // [...]
17121 //
17122 //
17123 // For 64-bit atomics which may reside in private memory, we perform a simpler
17124 // version that only inserts the private check, and uses the flat operation.
17125
17126 IRBuilder<> Builder(AI);
17127 LLVMContext &Ctx = Builder.getContext();
17128
17129 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17130 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17131 : AtomicCmpXchgInst::getPointerOperandIndex();
17132 Value *Addr = AI->getOperand(PtrOpIdx);
17133
17134 /// TODO: Only need to check private, then emit flat-known-not private (no
17135 /// need for shared block, or cast to global).
17136 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17137
17138 Align Alignment;
17139 if (RMW)
17140 Alignment = RMW->getAlign();
17141 else if (CX)
17142 Alignment = CX->getAlign();
17143 else
17144 llvm_unreachable("unhandled atomic operation");
17145
17146 // FullFlatEmulation is true if we need to issue the private, shared, and
17147 // global cases.
17148 //
17149 // If this is false, we are only dealing with the flat-targeting-private case,
17150 // where we only insert a check for private and still use the flat instruction
17151 // for global and shared.
17152
17153 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17154 Subtarget->hasAtomicFaddInsts() &&
17155 RMW->getType()->isFloatTy();
17156
17157 // If the return value isn't used, do not introduce a false use in the phi.
17158 bool ReturnValueIsUsed = !AI->use_empty();
17159
17160 BasicBlock *BB = Builder.GetInsertBlock();
17161 Function *F = BB->getParent();
17162 BasicBlock *ExitBB =
17163 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17164 BasicBlock *SharedBB = nullptr;
17165
17166 BasicBlock *CheckPrivateBB = BB;
17167 if (FullFlatEmulation) {
17168 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17169 CheckPrivateBB =
17170 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17171 }
17172
17173 BasicBlock *PrivateBB =
17174 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17175 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17176 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17177
17178 std::prev(BB->end())->eraseFromParent();
17179 Builder.SetInsertPoint(BB);
17180
17181 Value *LoadedShared = nullptr;
17182 if (FullFlatEmulation) {
17183 CallInst *IsShared = Builder.CreateIntrinsic(
17184 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17185 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17186 Builder.SetInsertPoint(SharedBB);
17187 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17188 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17189
17190 Instruction *Clone = AI->clone();
17191 Clone->insertInto(SharedBB, SharedBB->end());
17192 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17193 LoadedShared = Clone;
17194
17195 Builder.CreateBr(PhiBB);
17196 Builder.SetInsertPoint(CheckPrivateBB);
17197 }
17198
17199 CallInst *IsPrivate = Builder.CreateIntrinsic(
17200 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17201 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17202
17203 Builder.SetInsertPoint(PrivateBB);
17204
17205 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17206 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17207
17208 Value *LoadedPrivate;
17209 if (RMW) {
17210 LoadedPrivate = Builder.CreateAlignedLoad(
17211 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17212
17213 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17214 LoadedPrivate, RMW->getValOperand());
17215
17216 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17217 } else {
17218 auto [ResultLoad, Equal] =
17219 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17220 CX->getNewValOperand(), CX->getAlign());
17221
17222 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17223 ResultLoad, 0);
17224 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17225 }
17226
17227 Builder.CreateBr(PhiBB);
17228
17229 Builder.SetInsertPoint(GlobalBB);
17230
17231 // Continue using a flat instruction if we only emitted the check for private.
17232 Instruction *LoadedGlobal = AI;
17233 if (FullFlatEmulation) {
17234 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17235 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17236 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17237 }
17238
17239 AI->removeFromParent();
17240 AI->insertInto(GlobalBB, GlobalBB->end());
17241
17242 // The new atomicrmw may go through another round of legalization later.
17243 if (!FullFlatEmulation) {
17244 // We inserted the runtime check already, make sure we do not try to
17245 // re-expand this.
17246 // TODO: Should union with any existing metadata.
17247 MDBuilder MDB(F->getContext());
17248 MDNode *RangeNotPrivate =
17251 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17252 RangeNotPrivate);
17253 }
17254
17255 Builder.CreateBr(PhiBB);
17256
17257 Builder.SetInsertPoint(PhiBB);
17258
17259 if (ReturnValueIsUsed) {
17260 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17261 AI->replaceAllUsesWith(Loaded);
17262 if (FullFlatEmulation)
17263 Loaded->addIncoming(LoadedShared, SharedBB);
17264 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17265 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17266 Loaded->takeName(AI);
17267 }
17268
17269 Builder.CreateBr(ExitBB);
17270}
17271
17272void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17273 AtomicRMWInst::BinOp Op = AI->getOperation();
17274
17275 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17276 Op == AtomicRMWInst::Xor) {
17277 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17278 ConstVal && ConstVal->isNullValue()) {
17279 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17280 AI->setOperation(AtomicRMWInst::Add);
17281
17282 // We may still need the private-alias-flat handling below.
17283
17284 // TODO: Skip this for cases where we cannot access remote memory.
17285 }
17286 }
17287
17288 // The non-flat expansions should only perform the de-canonicalization of
17289 // identity values.
17290 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17291 return;
17292
17293 emitExpandAtomicAddrSpacePredicate(AI);
17294}
17295
17296void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17297 emitExpandAtomicAddrSpacePredicate(CI);
17298}
17299
17300LoadInst *
17301SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17302 IRBuilder<> Builder(AI);
17303 auto Order = AI->getOrdering();
17304
17305 // The optimization removes the store aspect of the atomicrmw, so the cache
17306 // must still be flushed if the atomic ordering has release semantics. That
17307 // flush is not necessarily a fence; a release fence just happens to do it.
17308 // Therefore, avoid replacing an atomicrmw that has release semantics.
17309 if (isReleaseOrStronger(Order))
17310 return nullptr;
17311
17312 LoadInst *LI = Builder.CreateAlignedLoad(
17313 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17314 LI->setAtomic(Order, AI->getSyncScopeID());
17315 LI->copyMetadata(*AI);
17316 LI->takeName(AI);
17317 AI->replaceAllUsesWith(LI);
17318 AI->eraseFromParent();
17319 return LI;
17320}
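// --- Illustrative sketch (not part of the original file) --------------------
// The canonical idempotent RMW this hook targets is "or with zero" (which is
// also what InstCombine produces for add/sub/xor with zero). A hypothetical
// snippet that creates such an instruction is shown below; because the
// ordering is acquire (not release or stronger), the hook above can rewrite it
// into an atomic load. Names are illustrative only.
static AtomicRMWInst *exampleBuildIdempotentOr(IRBuilder<> &Builder,
                                               Value *Ptr) {
  return Builder.CreateAtomicRMW(AtomicRMWInst::Or, Ptr, Builder.getInt32(0),
                                 MaybeAlign(), AtomicOrdering::Acquire);
}
// --- end sketch --------------------------------------------------------------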
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1211
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1122
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
bool isNegative() const
Definition: APFloat.h:1445
APInt bitcastToAPInt() const
Definition: APFloat.h:1351
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1100
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
bool isInfinity() const
Definition: APFloat.h:1442
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition: Function.cpp:349
const Function * getParent() const
Definition: Argument.h:43
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:640
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:544
static unsigned getPointerOperandIndex()
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
static unsigned getPointerOperandIndex()
Definition: Instructions.h:872
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
void setOperation(BinOp Operation)
Definition: Instructions.h:821
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:878
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1451
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
bool isSigned() const
Definition: InstrTypes.h:928
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Definition: DerivedTypes.h:105
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
iterator_range< arg_iterator > args()
Definition: Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:807
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasPrefetch() const
Definition: GCNSubtarget.h:962
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:905
bool hasD16Images() const
Definition: GCNSubtarget.h:710
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:867
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:863
bool hasDot7Insts() const
Definition: GCNSubtarget.h:809
bool hasApertureRegs() const
Definition: GCNSubtarget.h:611
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:859
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:421
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:912
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:690
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:537
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:595
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasDot1Insts() const
Definition: GCNSubtarget.h:785
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:875
Align getStackAlignment() const
Definition: GCNSubtarget.h:975
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool enableFlatScratch() const
Definition: GCNSubtarget.h:666
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:637
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:895
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
bool useDS128() const
Definition: GCNSubtarget.h:547
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:283
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:851
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:437
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:645
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:988
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:346
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:942
bool hasFFBL() const
Definition: GCNSubtarget.h:425
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:569
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:855
bool hasMed3_16() const
Definition: GCNSubtarget.h:433
bool hasUnalignedScratchAccessEnabled() const
Definition: GCNSubtarget.h:603
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:869
bool hasBFI() const
Definition: GCNSubtarget.h:413
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:587
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
Definition: GCNSubtarget.h:821
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:532
bool hasFFBH() const
Definition: GCNSubtarget.h:429
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:871
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:879
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:891
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:877
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:899
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:557
bool hasDot8Insts() const
Definition: GCNSubtarget.h:813
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:552
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:883
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:887
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:441
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:409
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:725
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
LLVMContext & getContext() const
Definition: IRBuilder.h:195
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2705
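
The IRBuilder entries above (CreateCondBr, CreateBr, CreatePHI, CreateAlignedLoad, SetInsertPoint) are the building blocks for the small IR control-flow diamonds that lowering code sometimes emits by hand. The following is a minimal, self-contained sketch of that pattern; the helper name emitSelectLoad and its arguments are hypothetical and are not code from this file.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Hypothetical helper: branch on Cond, load through one of two pointers, and
// merge the result with a PHI in a join block.
static Value *emitSelectLoad(IRBuilder<> &B, Value *Cond, Value *FastPtr,
                             Value *SlowPtr, Type *ValTy, Align A) {
  Function *F = B.GetInsertBlock()->getParent();
  LLVMContext &Ctx = B.getContext();

  BasicBlock *FastBB = BasicBlock::Create(Ctx, "fast", F);
  BasicBlock *SlowBB = BasicBlock::Create(Ctx, "slow", F);
  BasicBlock *JoinBB = BasicBlock::Create(Ctx, "join", F);

  B.CreateCondBr(Cond, FastBB, SlowBB);

  B.SetInsertPoint(FastBB);
  Value *FastVal = B.CreateAlignedLoad(ValTy, FastPtr, MaybeAlign(A), "fast.val");
  B.CreateBr(JoinBB);

  B.SetInsertPoint(SlowBB);
  Value *SlowVal = B.CreateAlignedLoad(ValTy, SlowPtr, MaybeAlign(A), "slow.val");
  B.CreateBr(JoinBB);

  B.SetInsertPoint(JoinBB);
  PHINode *Phi = B.CreatePHI(ValTy, 2, "merged");
  Phi->addIncoming(FastVal, FastBB);
  Phi->addIncoming(SlowVal, SlowBB);
  return Phi;
}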
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
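
A small sketch of the LLT helpers listed above (scalar, pointer, changeElementSize and the size queries); the address-space number and sizes below are arbitrary illustration values.

#include "llvm/CodeGenTypes/LowLevelType.h"
#include <cassert>
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);                             // 32-bit "bag of bits"
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);

  assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
  assert(P1.getSizeInBits().getFixedValue() == 64);

  // changeElementSize resizes vector elements; on a scalar it simply returns
  // a scalar of the new size.
  LLT S16 = S32.changeElementSize(16);
  assert(S16.getSizeInBits().getFixedValue() == 16);
}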
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext, if any.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
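
CreateAlignedLoad plus LoadInst::setAtomic is roughly the shape of load an idempotent-RMW rewrite can produce (compare the lowerIdempotentRMWIntoFencedLoad entry further down). A hedged sketch follows; the ordering and sync scope chosen here are illustrative only, and the helper name is hypothetical.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch only: emit an acquire atomic load of Ty through Ptr. A real
// transformation would take the ordering/scope from the original instruction.
static LoadInst *emitAcquireLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, Align A) {
  LoadInst *LI = B.CreateAlignedLoad(Ty, Ptr, MaybeAlign(A), "atomic.val");
  LI->setAtomic(AtomicOrdering::Acquire, SyncScope::System);
  return LI;
}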
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
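
MDBuilder::createRange pairs naturally with Instruction::setMetadata (listed earlier in this table) to annotate a load whose result is known to be bounded. A minimal sketch; the [0, 1024) bound is illustrative.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Attach !range [0, 1024) metadata to a 32-bit load.
static void annotateRange(LoadInst &LI) {
  MDBuilder MDB(LI.getContext());
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI.setMetadata(LLVMContext::MD_range, Range);
}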
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
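
A quick sketch of the MVT factory and query helpers above; the v4i32 type is an arbitrary example.

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

static void mvtExamples() {
  MVT EltVT = MVT::getIntegerVT(32);
  MVT VecVT = MVT::getVectorVT(EltVT, 4);   // v4i32

  assert(VecVT.isVector() && VecVT.getVectorNumElements() == 4);
  assert(VecVT.getScalarType() == MVT::i32);
  assert(VecVT.getSizeInBits() == 128);
  assert(VecVT.getStoreSize() == 16);       // bytes written by a store of v4i32
}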
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks which refer to FromMBB to refer to this.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
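
The MachineBasicBlock operations above (CreateMachineBasicBlock, splice, addSuccessor, transferSuccessorsAndUpdatePHIs) are the usual ingredients of block surgery in custom inserters. Below is a generic, hedged sketch of splitting a block after an instruction; the helper name splitAfter is hypothetical and this is not code from this file.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>
using namespace llvm;

// Split BB after MI into BB -> TailBB and rewire the CFG so TailBB inherits
// BB's old successors.
static MachineBasicBlock *splitAfter(MachineInstr &MI) {
  MachineBasicBlock &BB = *MI.getParent();
  MachineFunction &MF = *BB.getParent();

  MachineBasicBlock *TailBB = MF.CreateMachineBasicBlock(BB.getBasicBlock());
  MF.insert(std::next(BB.getIterator()), TailBB);

  // Move everything after MI into the new block, then fix up the edges.
  TailBB->splice(TailBB->begin(), &BB,
                 std::next(MachineBasicBlock::iterator(&MI)), BB.end());
  TailBB->transferSuccessorsAndUpdatePHIs(&BB);
  BB.addSuccessor(TailBB);
  return TailBB;
}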
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
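
The MachineInstrBuilder methods above are normally used through the BuildMI chaining style. A minimal sketch, with the opcode and operands supplied by the caller as placeholders:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;

// Emit "Dst = Opcode Src, Imm" before I. Opcode and registers are whatever
// the caller passes; this only illustrates the builder chaining.
static void emitRegImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                       const DebugLoc &DL, unsigned Opcode, Register Dst,
                       Register Src, int64_t Imm) {
  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
  BuildMI(MBB, I, DL, TII->get(Opcode), Dst)
      .addReg(Src)
      .addImm(Imm);
}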
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
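
The MachineMemOperand flags above are or'd together and handed to MachineFunction::getMachineMemOperand (listed earlier). A hedged sketch of building an operand describing an invariant 32-bit load; the flags, memory type, and alignment are illustrative.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      const MachinePointerInfo &PtrInfo) {
  MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                   MachineMemOperand::MODereferenceable |
                                   MachineMemOperand::MOInvariant;
  // Describe a 4-byte, 4-byte-aligned, read-only access.
  return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Align(4));
}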
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information that they claim to use by overriding the getAnalysisUsage function.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset, memcpy, and memmove lowering.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defining block.
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG) that are not handled by the Tablegen'd isel patterns.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand flags to them.
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that type.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of the specified type.
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can be turned into a fence followed by an atomic load.
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the address.
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering.
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp/select instructions.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled explicitly via copies.
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with this index.
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (touches memory).
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scalars in some contexts.
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fit into the return registers.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation.
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:750
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending or truncating it.
Definition: SelectionDAG.h:982
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:801
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the scalars and operating on each element individually.
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands, and they produce a value AND a token chain.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not necessarily identical types.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:856
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potential vector types) to integer type, then any-extending or truncating it.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:827
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check for vector.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:712
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or truncating it.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncating it.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:700
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:873
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or truncating it.
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
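
Most of the SelectionDAG helpers above follow the same pattern in custom lowering and combine code: inspect an SDValue's opcode and operands, then rebuild the computation with getNode and the constant helpers. The sketch below, which rewrites (mul x, 8) as (shl x, 3), is purely illustrative and is not a combine performed by this file.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Illustrative only: turn (mul x, 8) into (shl x, 3); return SDValue() when
// the node does not match, as lowering hooks conventionally do.
static SDValue lowerMulByEight(SDValue Op, SelectionDAG &DAG) {
  if (Op.getOpcode() != ISD::MUL)
    return SDValue();

  auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C || C->getZExtValue() != 8)
    return SDValue();

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue ShAmt = DAG.getShiftAmountConstant(3, VT, DL);
  return DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), ShAmt);
}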
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs.
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
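
StringSwitch is the usual way to map textual names (for example, register names in getRegisterByName-style hooks) onto values. A small sketch; the names and ids below are placeholders, not the set this target actually accepts.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Map a register name to a small id, -1 if unknown.
static int registerIdFromName(StringRef Name) {
  return StringSwitch<int>(Name)
      .Case("m0", 0)
      .Case("exec", 1)
      .Case("flat_scratch", 2)
      .Default(-1);
}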
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider type.
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scalars in some contexts.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a wider type.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the data layout.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
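
The configuration hooks above (setOperationAction, setTruncStoreAction, AddPromotedToType, setTargetDAGCombine) are normally invoked from a target's TargetLowering constructor. The fragment below is a hypothetical sketch of that wiring; the class, opcodes, types, and actions are placeholders and do not describe this target's actual settings.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

namespace {
// Hypothetical lowering class showing where the hooks are typically called.
class ExampleTargetLowering : public TargetLowering {
public:
  explicit ExampleTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    // Expand i64 division into other nodes / library calls.
    setOperationAction(ISD::SDIV, MVT::i64, Expand);
    // Route f32 FDIV through LowerOperation for a custom sequence.
    setOperationAction(ISD::FDIV, MVT::f32, Custom);
    // f64 -> f16 truncating stores are not directly supported.
    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    // Handle v2f32 loads as i64 loads.
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::i64);
    // Ask the combiner to call PerformDAGCombine for FADD nodes.
    setTargetDAGCombine(ISD::FADD);
  }
};
} // end anonymous namespace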
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts of Op.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the calling function.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline constants intended for floating-point values?
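A sketch of the check this performs, based on the AMDGPU inline-constant range for integers (-16..64); the exact bounds are stated as an assumption and the helper name is hypothetical, not a restatement of the implementation.

// Integer inline constants on AMDGPU are the values -16..64 inclusive.
static bool isInlinableIntLiteralSketch(int64_t Literal) {
  return Literal >= -16 && Literal <= 64;
}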
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
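A host-side illustration of the difference between the two nodes above (FMAD rounds the product before the add, FMA rounds only once); this is plain C++, not SelectionDAG code, and the constants are chosen only to make the two results differ.

#include <cmath>
#include <cstdio>

int main() {
  float a = 1.0f + 0x1p-23f;        // 1 plus one ulp
  float c = -(1.0f + 0x1p-22f);     // exactly -(a*a rounded to float)
  // Build with -ffp-contract=off (or equivalent) so the compiler does not
  // fuse the "separate" form on its own.
  float separate = a * a + c;       // FMAD-like: product rounded first -> 0
  float fused = std::fmaf(a, a, c); // FMA: single rounding -> 2^-46
  std::printf("separate = %g, fused = %g\n", separate, fused);
}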
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
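A minimal host-side illustration of the ldexp semantics noted above (op0 * 2**op1), using the standard C++ library rather than the DAG node itself.

#include <cassert>
#include <cmath>

int main() {
  double x = 3.0;
  assert(std::ldexp(x, 4) == x * 16.0);  // 3 * 2^4  = 48
  assert(std::ldexp(x, -1) == 1.5);      // 3 * 2^-1 = 1.5
}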
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1490
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating-point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest, ties to even, 2 Round to ...
Definition: ISDOpcodes.h:931
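A host-side sketch of the same encoding as exposed by C's FLT_ROUNDS, which this node's documented values mirror; behavior under fesetround is implementation-dependent (a strictly conforming build may also need #pragma STDC FENV_ACCESS ON), so treat this as illustrative only.

#include <cfenv>
#include <cfloat>
#include <cstdio>

int main() {
  std::printf("default rounding mode: %d\n", FLT_ROUNDS);   // typically 1 (to nearest)
  std::fesetround(FE_TOWARDZERO);
  std::printf("after fesetround:      %d\n", FLT_ROUNDS);   // typically 0 (toward zero)
}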
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware multiplication: produces the product and a boolean second result indicating whether signed/unsigned overflow occurred.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
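A host-side sketch of the SHL/SRA pair mentioned above: sign-extending an 8-bit value held in the low bits of a 32-bit register. Plain C++, not DAG code; the unsigned-to-signed conversion used here is implementation-defined before C++20 but behaves as expected on common targets.

#include <cassert>
#include <cstdint>

int32_t signExtendInReg8(int32_t v) {
  const int Shift = 32 - 8;
  // Shift left to place the small value's sign bit at bit 31, then
  // arithmetic-shift right to replicate it into the high bits.
  return (int32_t)((uint32_t)v << Shift) >> Shift;
}

int main() {
  assert(signExtendInReg8(0x000000FF) == -1);
  assert(signExtendInReg8(0x0000007F) == 127);
}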
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum, which behave the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1639
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1606
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1586
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1609
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:55
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for an N-bit signed integer.
Definition: MathExtras.h:246
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:557
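A sketch of the usual formula for this operation, shown for unsigned 64-bit values; the helper name is hypothetical and the formula is an assumption about the technique, not a copy of the MathExtras implementation.

#include <cassert>
#include <cstdint>

constexpr uint64_t alignDownSketch(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  Skew %= Align;                                // reduce the skew first
  return (Value - Skew) / Align * Align + Skew; // round down within the skewed lattice
}

int main() {
  assert(alignDownSketch(37, 8) == 32);
  assert(alignDownSketch(37, 8, 3) == 35);  // largest n <= 37 with n % 8 == 3
}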
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
Definition: MathExtras.h:396
int countr_zero(T Val)
Count the number of zero bits from the least significant bit towards the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:287
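A standalone sketch of one common way to test this property (a contiguous run of ones and nothing else); it mirrors the technique rather than quoting the MathExtras implementation, and the helper name is hypothetical.

#include <cassert>
#include <cstdint>

constexpr bool isShiftedMask64Sketch(uint64_t V) {
  // Filling the zeros below the run must yield a low mask of the form 0..01..1.
  uint64_t M = (V - 1) | V;
  return V != 0 && (M & (M + 1)) == 0;
}

int main() {
  assert(isShiftedMask64Sketch(0b0111000));   // one contiguous run of ones
  assert(!isShiftedMask64Sketch(0b0101000));  // two separate runs
  assert(!isShiftedMask64Sketch(0));          // empty run is rejected
}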
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
int countl_zero(T Val)
Count the number of zero bits from the most significant bit towards the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:161
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for an N-bit signed integer.
Definition: MathExtras.h:237
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
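A small usage sketch of the two EVT factory helpers listed above; it assumes an existing LLVMContext and is illustrative only, not code taken from SIISelLowering.cpp.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

void evtExamples(llvm::LLVMContext &Ctx) {
  // An integer EVT of arbitrary width (here i48, which has no simple MVT).
  llvm::EVT I48 = llvm::EVT::getIntegerVT(Ctx, 48);
  // A fixed-width vector of four f32 elements (v4f32).
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  (void)I48;
  (void)V4F32;
}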
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
Definition: KnownBits.h:336
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals