SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
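// For illustration (an assumed call pattern, not code from this file):
// findFirstFreeSGPR is the kind of helper used when argument lowering must
// reserve an SGPR for a special input that the calling convention has not
// already handed out, roughly:
//
//   unsigned Reg = findFirstFreeSGPR(CCInfo); // first SGPR not yet allocated
//   CCInfo.AllocateReg(Reg);                  // reserve it for the input
//
// The real call sites appear later in this file's argument lowering.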
83
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
 161 // Unless there are also VOP3P operations, no operations are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
 217 // FIXME: The promoted-to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value,
452 // let LLVM add the comparison
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
 465 // This is s_memtime on SI and s_memrealtime on VI.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
788 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
789 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
790 Custom);
791
792 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
793 // Split vector operations.
798 VT, Custom);
799
800 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
801 // Split vector operations.
803 VT, Custom);
804
805 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
806 Custom);
807
808 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
809 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
810 Custom);
811
812 if (Subtarget->hasPackedFP32Ops()) {
814 MVT::v2f32, Legal);
816 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
817 Custom);
818 }
819 }
820
822
823 if (Subtarget->has16BitInsts()) {
825 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
827 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
828 } else {
829 // Legalization hack.
830 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
831
833 }
834
836 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
837 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
838 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
839 MVT::v32f16, MVT::v32bf16},
840 Custom);
841
843
844 if (Subtarget->hasScalarSMulU64())
846
847 if (Subtarget->hasMad64_32())
849
850 if (Subtarget->hasPrefetch())
852
853 if (Subtarget->hasIEEEMinMax()) {
855 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
858 Custom);
859 } else {
860 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
861 if (Subtarget->hasMinimum3Maximum3F32())
863
864 if (Subtarget->hasMinimum3Maximum3PKF16())
866 }
867
869 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
870 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
871 MVT::i8},
872 Custom);
873
875 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
876 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
877 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
878 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
879 Custom);
880
882 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
883 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
884 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
885 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
886 Custom);
887
893
894 // TODO: Could move this to custom lowering, could benefit from combines on
895 // extract of relevant bits.
897
899
900 if (Subtarget->hasBF16ConversionInsts()) {
904 }
905
906 if (Subtarget->hasCvtPkF16F32Inst()) {
908 }
909
912 ISD::SUB,
914 ISD::MUL,
915 ISD::FADD,
916 ISD::FSUB,
917 ISD::FDIV,
918 ISD::FMUL,
925 ISD::FMA,
926 ISD::SMIN,
927 ISD::SMAX,
928 ISD::UMIN,
929 ISD::UMAX,
932 ISD::SMIN,
933 ISD::SMAX,
934 ISD::UMIN,
935 ISD::UMAX,
936 ISD::AND,
937 ISD::OR,
938 ISD::XOR,
939 ISD::SHL,
940 ISD::SRL,
941 ISD::SRA,
942 ISD::FSHR,
952
953 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
955
956 // All memory operations. Some folding on the pointer operand is done to help
957 // matching the constant offsets in the addressing modes.
982
983 // FIXME: In other contexts we pretend this is a per-function property.
985
987}
988
989const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
990
992 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
993 return RCRegs;
994}
995
996//===----------------------------------------------------------------------===//
997// TargetLowering queries
998//===----------------------------------------------------------------------===//
999
1000// v_mad_mix* support a conversion from f16 to f32.
1001//
 1002// There is only one special case where this is OK to use when denormals are
 1003// enabled, which we don't currently handle.
1004bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1005 EVT DestVT, EVT SrcVT) const {
1006 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1007 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1008 DestVT.getScalarType() == MVT::f32 &&
1009 SrcVT.getScalarType() == MVT::f16 &&
1010 // TODO: This probably only requires no input flushing?
1012}
1013
1015 LLT DestTy, LLT SrcTy) const {
1016 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1017 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1018 DestTy.getScalarSizeInBits() == 32 &&
1019 SrcTy.getScalarSizeInBits() == 16 &&
1020 // TODO: This probably only requires no input flushing?
1021 denormalModeIsFlushAllF32(*MI.getMF());
1022}
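// For illustration (assumed selection outcome, not code from this file): the
// fold above lets a separate f16 -> f32 extension be absorbed into a
// mixed-precision FMA when the subtarget has the *_mix instructions and f32
// denormals are flushed. Roughly, IR like
//
//   %x = fpext half %a to float
//   %r = call float @llvm.fma.f32(float %x, float %b, float %c)
//
// can become a single v_fma_mix_f32 with an f16 source operand instead of a
// v_cvt_f32_f16 followed by v_fma_f32. Instruction names here illustrate the
// intent rather than state the exact selection.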
1023
1025 // SI has some legal vector types, but no legal vector operations. Say no
1026 // shuffles are legal in order to prefer scalarizing some vector operations.
1027 return false;
1028}
1029
1032 EVT VT) const {
1035
1036 if (VT.isVector()) {
1037 EVT ScalarVT = VT.getScalarType();
1038 unsigned Size = ScalarVT.getSizeInBits();
1039 if (Size == 16) {
1040 if (Subtarget->has16BitInsts()) {
1041 if (VT.isInteger())
1042 return MVT::v2i16;
1043 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1044 }
1045 return VT.isInteger() ? MVT::i32 : MVT::f32;
1046 }
1047
1048 if (Size < 16)
1049 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1050 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1051 }
1052
1053 if (VT.getSizeInBits() > 32)
1054 return MVT::i32;
1055
1057}
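// Worked examples for the mapping above (illustrative, assuming a subtarget
// with 16-bit instructions):
//   v4f16  -> register type v2f16 (16-bit FP elements pair up)
//   v4bf16 -> register type i32   (bf16 pairs are carried as i32)
//   v8i8   -> register type i16   (sub-16-bit elements widen to i16)
//   i64    -> register type i32   (scalars wider than 32 bits split into i32)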
1058
1061 EVT VT) const {
1064
1065 if (VT.isVector()) {
1066 unsigned NumElts = VT.getVectorNumElements();
1067 EVT ScalarVT = VT.getScalarType();
1068 unsigned Size = ScalarVT.getSizeInBits();
1069
1070 // FIXME: Should probably promote 8-bit vectors to i16.
1071 if (Size == 16 && Subtarget->has16BitInsts())
1072 return (NumElts + 1) / 2;
1073
1074 if (Size <= 32)
1075 return NumElts;
1076
1077 if (Size > 32)
1078 return NumElts * ((Size + 31) / 32);
1079 } else if (VT.getSizeInBits() > 32)
1080 return (VT.getSizeInBits() + 31) / 32;
1081
1083}
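// Worked examples for the register counts above (illustrative, again assuming
// 16-bit instructions are available):
//   v5f16 -> (5 + 1) / 2 = 3 registers
//   v3i32 -> 3 registers
//   v3i64 -> 3 * ((64 + 31) / 32) = 6 registers
//   i64   -> (64 + 31) / 32 = 2 registers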
1084
1086 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1087 unsigned &NumIntermediates, MVT &RegisterVT) const {
1088 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1089 unsigned NumElts = VT.getVectorNumElements();
1090 EVT ScalarVT = VT.getScalarType();
1091 unsigned Size = ScalarVT.getSizeInBits();
1092 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1093 // support, but unless we can properly handle 3-vectors, it will still be
1094 // inconsistent.
1095 if (Size == 16 && Subtarget->has16BitInsts()) {
1096 if (ScalarVT == MVT::bf16) {
1097 RegisterVT = MVT::i32;
1098 IntermediateVT = MVT::v2bf16;
1099 } else {
1100 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1101 IntermediateVT = RegisterVT;
1102 }
1103 NumIntermediates = (NumElts + 1) / 2;
1104 return NumIntermediates;
1105 }
1106
1107 if (Size == 32) {
1108 RegisterVT = ScalarVT.getSimpleVT();
1109 IntermediateVT = RegisterVT;
1110 NumIntermediates = NumElts;
1111 return NumIntermediates;
1112 }
1113
1114 if (Size < 16 && Subtarget->has16BitInsts()) {
1115 // FIXME: Should probably form v2i16 pieces
1116 RegisterVT = MVT::i16;
1117 IntermediateVT = ScalarVT;
1118 NumIntermediates = NumElts;
1119 return NumIntermediates;
1120 }
1121
1122 if (Size != 16 && Size <= 32) {
1123 RegisterVT = MVT::i32;
1124 IntermediateVT = ScalarVT;
1125 NumIntermediates = NumElts;
1126 return NumIntermediates;
1127 }
1128
1129 if (Size > 32) {
1130 RegisterVT = MVT::i32;
1131 IntermediateVT = RegisterVT;
1132 NumIntermediates = NumElts * ((Size + 31) / 32);
1133 return NumIntermediates;
1134 }
1135 }
1136
1138 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1139}
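// Worked examples for the breakdown above (illustrative, non-kernel calling
// convention with 16-bit instructions):
//   v3f16  -> IntermediateVT = v2f16,  RegisterVT = v2f16, NumIntermediates = 2
//   v3bf16 -> IntermediateVT = v2bf16, RegisterVT = i32,   NumIntermediates = 2
//   v2i64  -> IntermediateVT = i32,    RegisterVT = i32,   NumIntermediates = 4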
1140
1142 const DataLayout &DL, Type *Ty,
1143 unsigned MaxNumLanes) {
1144 assert(MaxNumLanes != 0);
1145
1146 LLVMContext &Ctx = Ty->getContext();
1147 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1148 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1149 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1150 NumElts);
1151 }
1152
1153 return TLI.getValueType(DL, Ty);
1154}
1155
1156// Peek through TFE struct returns to only use the data size.
1158 const DataLayout &DL, Type *Ty,
1159 unsigned MaxNumLanes) {
1160 auto *ST = dyn_cast<StructType>(Ty);
1161 if (!ST)
1162 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1163
1164 // TFE intrinsics return an aggregate type.
1165 assert(ST->getNumContainedTypes() == 2 &&
1166 ST->getContainedType(1)->isIntegerTy(32));
1167 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1168}
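// For illustration (the types are an assumption about a typical caller, not
// taken from this file): an image load with TFE enabled returns an aggregate
// such as { <4 x float>, i32 }, where the trailing i32 is the fail/status
// word. Only the data member contributes to the memory VT, and the lane count
// is clamped by the caller-provided MaxNumLanes (derived from the dmask), so
// with dmask = 0b0011 the reported memVT is v2f32 rather than v4f32.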
1169
1170/// Map address space 7 to MVT::v5i32 because that's its in-memory
1171/// representation. This return value is vector-typed because there is no
1172/// MVT::i160 and it is not clear if one can be added. While this could
1173/// cause issues during codegen, these address space 7 pointers will be
1174/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1175/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1176/// modeling, to work.
1178 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1179 return MVT::v5i32;
1181 DL.getPointerSizeInBits(AS) == 192)
1182 return MVT::v6i32;
1184}
1185/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1186/// v8i32 when padding is added.
1187/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1188/// also v8i32 with padding.
1190 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1191 DL.getPointerSizeInBits(AS) == 160) ||
1193 DL.getPointerSizeInBits(AS) == 192))
1194 return MVT::v8i32;
1196}
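// Worked example of the two queries above (illustrative): a 160-bit buffer fat
// pointer (p7) is reported as v5i32 by getPointerTy but as v8i32 by
// getPointerMemTy, since its in-memory form is padded to 256 bits; the 192-bit
// buffer strided pointer (p9, {p8, i32, i32}) likewise maps to v6i32 / v8i32.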
1197
1199 const CallInst &CI,
1200 MachineFunction &MF,
1201 unsigned IntrID) const {
1203 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1205 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1207 Info.flags |= getTargetMMOFlags(CI);
1208
1209 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1211 AttributeList Attr =
1213 MemoryEffects ME = Attr.getMemoryEffects();
1214 if (ME.doesNotAccessMemory())
1215 return false;
1216
1217 // TODO: Should images get their own address space?
1218 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1219
1220 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1221 if (RsrcIntr->IsImage) {
1224 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1225 Info.align.reset();
1226 }
1227
1228 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1229 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1230 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1231 // We conservatively set the memory operand of a buffer intrinsic to the
1232 // base resource pointer, so that we can access alias information about
1233 // those pointers. Cases like "this points at the same value
1234 // but with a different offset" are handled in
1235 // areMemAccessesTriviallyDisjoint.
1236 Info.ptrVal = RsrcArg;
1237 }
1238
1239 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1240 if (!IsSPrefetch) {
1241 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1242 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1244 }
1245
1247 if (ME.onlyReadsMemory()) {
1248 if (RsrcIntr->IsImage) {
1249 unsigned MaxNumLanes = 4;
1250
1251 if (!BaseOpcode->Gather4) {
1252 // If this isn't a gather, we may have excess loaded elements in the
1253 // IR type. Check the dmask for the real number of elements loaded.
1254 unsigned DMask =
1255 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1256 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1257 }
1258
1259 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1260 CI.getType(), MaxNumLanes);
1261 } else {
1262 Info.memVT =
1264 std::numeric_limits<unsigned>::max());
1265 }
1266
1267 // FIXME: What does alignment mean for an image?
1270 } else if (ME.onlyWritesMemory()) {
1272
1273 Type *DataTy = CI.getArgOperand(0)->getType();
1274 if (RsrcIntr->IsImage) {
1275 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1276 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1277 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1278 DMaskLanes);
1279 } else
1280 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1281
1283 } else {
1284 // Atomic, NoReturn Sampler or prefetch
1287 Info.flags |=
1289
1290 if (!IsSPrefetch)
1292
1293 switch (IntrID) {
1294 default:
1295 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1296 // Fake memory access type for no return sampler intrinsics
1297 Info.memVT = MVT::i32;
1298 } else {
1299 // XXX - Should this be volatile without known ordering?
1301 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1302 }
1303 break;
1304 case Intrinsic::amdgcn_raw_buffer_load_lds:
1305 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1306 case Intrinsic::amdgcn_struct_buffer_load_lds:
1307 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1308 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1309 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1310 Info.ptrVal = CI.getArgOperand(1);
1311 return true;
1312 }
1313 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1314 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1315 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1316 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1317 Info.memVT =
1319 std::numeric_limits<unsigned>::max());
1320 Info.flags &= ~MachineMemOperand::MOStore;
1321 return true;
1322 }
1323 }
1324 }
1325 return true;
1326 }
1327
1328 switch (IntrID) {
1329 case Intrinsic::amdgcn_ds_ordered_add:
1330 case Intrinsic::amdgcn_ds_ordered_swap: {
1332 Info.memVT = MVT::getVT(CI.getType());
1333 Info.ptrVal = CI.getOperand(0);
1334 Info.align.reset();
1336
1337 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1338 if (!Vol->isZero())
1340
1341 return true;
1342 }
1343 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1344 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1346 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1347 Info.ptrVal = nullptr;
1348 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1350 return true;
1351 }
1352 case Intrinsic::amdgcn_ds_append:
1353 case Intrinsic::amdgcn_ds_consume: {
1355 Info.memVT = MVT::getVT(CI.getType());
1356 Info.ptrVal = CI.getOperand(0);
1357 Info.align.reset();
1359
1360 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1361 if (!Vol->isZero())
1363
1364 return true;
1365 }
1366 case Intrinsic::amdgcn_global_atomic_csub: {
1368 Info.memVT = MVT::getVT(CI.getType());
1369 Info.ptrVal = CI.getOperand(0);
1370 Info.align.reset();
1373 return true;
1374 }
1375 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1377 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1378
1379 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1380 Info.align.reset();
1381 Info.flags |=
1383 return true;
1384 }
1385 case Intrinsic::amdgcn_global_atomic_fmin_num:
1386 case Intrinsic::amdgcn_global_atomic_fmax_num:
1387 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1388 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1389 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1390 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1392 Info.memVT = MVT::getVT(CI.getType());
1393 Info.ptrVal = CI.getOperand(0);
1394 Info.align.reset();
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_global_load_tr_b64:
1401 case Intrinsic::amdgcn_global_load_tr_b128:
1402 case Intrinsic::amdgcn_ds_read_tr4_b64:
1403 case Intrinsic::amdgcn_ds_read_tr6_b96:
1404 case Intrinsic::amdgcn_ds_read_tr8_b64:
1405 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1407 Info.memVT = MVT::getVT(CI.getType());
1408 Info.ptrVal = CI.getOperand(0);
1409 Info.align.reset();
1411 return true;
1412 }
1413 case Intrinsic::amdgcn_ds_gws_init:
1414 case Intrinsic::amdgcn_ds_gws_barrier:
1415 case Intrinsic::amdgcn_ds_gws_sema_v:
1416 case Intrinsic::amdgcn_ds_gws_sema_br:
1417 case Intrinsic::amdgcn_ds_gws_sema_p:
1418 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1420
1421 const GCNTargetMachine &TM =
1422 static_cast<const GCNTargetMachine &>(getTargetMachine());
1423
1425 Info.ptrVal = MFI->getGWSPSV(TM);
1426
1427 // This is an abstract access, but we need to specify a type and size.
1428 Info.memVT = MVT::i32;
1429 Info.size = 4;
1430 Info.align = Align(4);
1431
1432 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1434 else
1436 return true;
1437 }
1438 case Intrinsic::amdgcn_global_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1444 return true;
1445 }
1446 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1448
1449 const GCNTargetMachine &TM =
1450 static_cast<const GCNTargetMachine &>(getTargetMachine());
1451
1453 Info.ptrVal = MFI->getGWSPSV(TM);
1454
1455 // This is an abstract access, but we need to specify a type and size.
1456 Info.memVT = MVT::i32;
1457 Info.size = 4;
1458 Info.align = Align(4);
1459
1461 return true;
1462 }
1463 case Intrinsic::amdgcn_s_prefetch_data: {
1465 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1466 Info.ptrVal = CI.getArgOperand(0);
1468 return true;
1469 }
1470 default:
1471 return false;
1472 }
1473}
1474
1476 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1477 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1478 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1479 // The DAG's ValueType loses the addrspaces.
1480 // Add them as 2 extra Constant operands "from" and "to".
1481 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1482 unsigned DstAS = I.getType()->getPointerAddressSpace();
1483 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1484 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1485 break;
1486 }
1487 default:
1488 break;
1489 }
1490}
1491
1494 Type *&AccessTy) const {
1495 Value *Ptr = nullptr;
1496 switch (II->getIntrinsicID()) {
1497 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1498 case Intrinsic::amdgcn_ds_append:
1499 case Intrinsic::amdgcn_ds_consume:
1500 case Intrinsic::amdgcn_ds_read_tr4_b64:
1501 case Intrinsic::amdgcn_ds_read_tr6_b96:
1502 case Intrinsic::amdgcn_ds_read_tr8_b64:
1503 case Intrinsic::amdgcn_ds_read_tr16_b64:
1504 case Intrinsic::amdgcn_ds_ordered_add:
1505 case Intrinsic::amdgcn_ds_ordered_swap:
1506 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1507 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1508 case Intrinsic::amdgcn_global_atomic_csub:
1509 case Intrinsic::amdgcn_global_atomic_fmax_num:
1510 case Intrinsic::amdgcn_global_atomic_fmin_num:
1511 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1512 case Intrinsic::amdgcn_global_load_tr_b64:
1513 case Intrinsic::amdgcn_global_load_tr_b128:
1514 Ptr = II->getArgOperand(0);
1515 break;
1516 case Intrinsic::amdgcn_global_load_lds:
1517 Ptr = II->getArgOperand(1);
1518 break;
1519 default:
1520 return false;
1521 }
1522 AccessTy = II->getType();
1523 Ops.push_back(Ptr);
1524 return true;
1525}
1526
1528 unsigned AddrSpace) const {
1529 if (!Subtarget->hasFlatInstOffsets()) {
1530 // Flat instructions do not have offsets, and only have the register
1531 // address.
1532 return AM.BaseOffs == 0 && AM.Scale == 0;
1533 }
1534
1535 decltype(SIInstrFlags::FLAT) FlatVariant =
1539
1540 return AM.Scale == 0 &&
1541 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1542 AM.BaseOffs, AddrSpace, FlatVariant));
1543}
1544
1546 if (Subtarget->hasFlatGlobalInsts())
1548
1549 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
 1550 // Assume that we will use FLAT for all global memory accesses
1551 // on VI.
1552 // FIXME: This assumption is currently wrong. On VI we still use
1553 // MUBUF instructions for the r + i addressing mode. As currently
1554 // implemented, the MUBUF instructions only work on buffer < 4GB.
1555 // It may be possible to support > 4GB buffers with MUBUF instructions,
1556 // by setting the stride value in the resource descriptor which would
1557 // increase the size limit to (stride * 4GB). However, this is risky,
1558 // because it has never been validated.
1560 }
1561
1562 return isLegalMUBUFAddressingMode(AM);
1563}
1564
1565bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1566 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1567 // additionally can do r + r + i with addr64. 32-bit has more addressing
1568 // mode options. Depending on the resource constant, it can also do
1569 // (i64 r0) + (i32 r1) * (i14 i).
1570 //
1571 // Private arrays end up using a scratch buffer most of the time, so also
1572 // assume those use MUBUF instructions. Scratch loads / stores are currently
1573 // implemented as mubuf instructions with offen bit set, so slightly
1574 // different than the normal addr64.
1575 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1576 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1577 return false;
1578
1579 // FIXME: Since we can split immediate into soffset and immediate offset,
1580 // would it make sense to allow any immediate?
1581
1582 switch (AM.Scale) {
1583 case 0: // r + i or just i, depending on HasBaseReg.
1584 return true;
1585 case 1:
1586 return true; // We have r + r or r + i.
1587 case 2:
1588 if (AM.HasBaseReg) {
1589 // Reject 2 * r + r.
1590 return false;
1591 }
1592
1593 // Allow 2 * r as r + r
1594 // Or 2 * r + i is allowed as r + r + i.
1595 return true;
1596 default: // Don't allow n * r
1597 return false;
1598 }
1599}
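// For illustration (assumed AddrMode values, not from this file), the checks
// above accept and reject modes like:
//   { BaseOffs = 16, HasBaseReg = true,  Scale = 1 } -> legal, r + r + imm
//   { BaseOffs = 0,  HasBaseReg = false, Scale = 2 } -> legal, 2 * r as r + r
//   { BaseOffs = 0,  HasBaseReg = true,  Scale = 2 } -> rejected, 2 * r + r
//   { BaseOffs = 0,  HasBaseReg = true,  Scale = 4 } -> rejected, no n * r form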
1600
1602 const AddrMode &AM, Type *Ty,
1603 unsigned AS,
1604 Instruction *I) const {
1605 // No global is ever allowed as a base.
1606 if (AM.BaseGV)
1607 return false;
1608
1609 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1610 return isLegalGlobalAddressingMode(AM);
1611
1612 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1616 // If the offset isn't a multiple of 4, it probably isn't going to be
1617 // correctly aligned.
1618 // FIXME: Can we get the real alignment here?
1619 if (AM.BaseOffs % 4 != 0)
1620 return isLegalMUBUFAddressingMode(AM);
1621
1622 if (!Subtarget->hasScalarSubwordLoads()) {
1623 // There are no SMRD extloads, so if we have to do a small type access we
1624 // will use a MUBUF load.
1625 // FIXME?: We also need to do this if unaligned, but we don't know the
1626 // alignment here.
1627 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1628 return isLegalGlobalAddressingMode(AM);
1629 }
1630
1632 // SMRD instructions have an 8-bit, dword offset on SI.
1633 if (!isUInt<8>(AM.BaseOffs / 4))
1634 return false;
1635 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1636 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1637 // in 8-bits, it can use a smaller encoding.
1638 if (!isUInt<32>(AM.BaseOffs / 4))
1639 return false;
1640 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1641 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1642 if (!isUInt<20>(AM.BaseOffs))
1643 return false;
1644 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1645 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1646 // for S_BUFFER_* instructions).
1647 if (!isInt<21>(AM.BaseOffs))
1648 return false;
1649 } else {
1650 // On GFX12, all offsets are signed 24-bit in bytes.
1651 if (!isInt<24>(AM.BaseOffs))
1652 return false;
1653 }
1654
1655 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1657 AM.BaseOffs < 0) {
1658 // Scalar (non-buffer) loads can only use a negative offset if
1659 // soffset+offset is non-negative. Since the compiler can only prove that
1660 // in a few special cases, it is safer to claim that negative offsets are
1661 // not supported.
1662 return false;
1663 }
1664
1665 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1666 return true;
1667
1668 if (AM.Scale == 1 && AM.HasBaseReg)
1669 return true;
1670
1671 return false;
1672 }
1673
1674 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1675 return Subtarget->enableFlatScratch()
1677 : isLegalMUBUFAddressingMode(AM);
1678
1679 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1680 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1681 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1682 // field.
1683 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1684 // an 8-bit dword offset but we don't know the alignment here.
1685 if (!isUInt<16>(AM.BaseOffs))
1686 return false;
1687
1688 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1689 return true;
1690
1691 if (AM.Scale == 1 && AM.HasBaseReg)
1692 return true;
1693
1694 return false;
1695 }
1696
1698 // For an unknown address space, this usually means that this is for some
1699 // reason being used for pure arithmetic, and not based on some addressing
1700 // computation. We don't have instructions that compute pointers with any
1701 // addressing modes, so treat them as having no offset like flat
1702 // instructions.
1704 }
1705
1706 // Assume a user alias of global for unknown address spaces.
1707 return isLegalGlobalAddressingMode(AM);
1708}
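// Worked example for the constant-address (SMRD) case above (illustrative): a
// register base with a byte offset of 1024 encodes as a dword offset of 256,
// which does not fit the 8-bit SI field, so the mode is rejected on SI, while
// VI (20-bit byte offset) and GFX9+ (signed 21-bit byte offset) accept it.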
1709
1711 const MachineFunction &MF) const {
1713 return (MemVT.getSizeInBits() <= 4 * 32);
1714 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1715 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1716 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1717 }
1719 return (MemVT.getSizeInBits() <= 2 * 32);
1720 return true;
1721}
1722
1724 unsigned Size, unsigned AddrSpace, Align Alignment,
1725 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1726 if (IsFast)
1727 *IsFast = 0;
1728
1729 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1730 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1731 // Check if alignment requirements for ds_read/write instructions are
1732 // disabled.
1733 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1734 return false;
1735
1736 Align RequiredAlignment(
1737 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1738 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1739 Alignment < RequiredAlignment)
1740 return false;
1741
 1742 // Either the alignment requirements are "enabled", or there is an
 1743 // unaligned LDS access related hardware bug even though alignment requirements
1744 // are "disabled". In either case, we need to check for proper alignment
1745 // requirements.
1746 //
1747 switch (Size) {
1748 case 64:
1749 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1750 // address is negative, then the instruction is incorrectly treated as
1751 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1752 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1753 // load later in the SILoadStoreOptimizer.
1754 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1755 return false;
1756
 1757 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1758 // can do a 4 byte aligned, 8 byte access in a single operation using
1759 // ds_read2/write2_b32 with adjacent offsets.
1760 RequiredAlignment = Align(4);
1761
1762 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1763 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1764 // ds_write2_b32 depending on the alignment. In either case with either
1765 // alignment there is no faster way of doing this.
1766
1767 // The numbers returned here and below are not additive, it is a 'speed
1768 // rank'. They are just meant to be compared to decide if a certain way
1769 // of lowering an operation is faster than another. For that purpose
 1770 // naturally aligned operation gets its bitsize to indicate that "it
1771 // operates with a speed comparable to N-bit wide load". With the full
1772 // alignment ds128 is slower than ds96 for example. If underaligned it
1773 // is comparable to a speed of a single dword access, which would then
1774 // mean 32 < 128 and it is faster to issue a wide load regardless.
 1775 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
 1776 // wider load which will not be aligned anymore, the latter is slower.
1777 if (IsFast)
1778 *IsFast = (Alignment >= RequiredAlignment) ? 64
1779 : (Alignment < Align(4)) ? 32
1780 : 1;
1781 return true;
1782 }
1783
1784 break;
1785 case 96:
1786 if (!Subtarget->hasDS96AndDS128())
1787 return false;
1788
 1789 // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
1790 // gfx8 and older.
1791
1792 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1793 // Naturally aligned access is fastest. However, also report it is Fast
1794 // if memory is aligned less than DWORD. A narrow load or store will be
 1795 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1796 // be more of them, so overall we will pay less penalty issuing a single
1797 // instruction.
1798
1799 // See comment on the values above.
1800 if (IsFast)
1801 *IsFast = (Alignment >= RequiredAlignment) ? 96
1802 : (Alignment < Align(4)) ? 32
1803 : 1;
1804 return true;
1805 }
1806
1807 break;
1808 case 128:
1809 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1810 return false;
1811
 1812 // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
 1813 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1814 // single operation using ds_read2/write2_b64.
1815 RequiredAlignment = Align(8);
1816
1817 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1818 // Naturally aligned access is fastest. However, also report it is Fast
1819 // if memory is aligned less than DWORD. A narrow load or store will be
 1820 // equally slow as a single ds_read_b128/ds_write_b128, but there
1821 // will be more of them, so overall we will pay less penalty issuing a
1822 // single instruction.
1823
1824 // See comment on the values above.
1825 if (IsFast)
1826 *IsFast = (Alignment >= RequiredAlignment) ? 128
1827 : (Alignment < Align(4)) ? 32
1828 : 1;
1829 return true;
1830 }
1831
1832 break;
1833 default:
1834 if (Size > 32)
1835 return false;
1836
1837 break;
1838 }
1839
1840 // See comment on the values above.
1841 // Note that we have a single-dword or sub-dword here, so if underaligned
 1842 // it is the slowest possible access, hence the returned value is 0.
1843 if (IsFast)
1844 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1845
1846 return Alignment >= RequiredAlignment ||
1847 Subtarget->hasUnalignedDSAccessEnabled();
1848 }
1849
1850 // FIXME: We have to be conservative here and assume that flat operations
1851 // will access scratch. If we had access to the IR function, then we
1852 // could determine if any private memory was used in the function.
1853 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1854 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1855 bool AlignedBy4 = Alignment >= Align(4);
1856 if (IsFast)
1857 *IsFast = AlignedBy4;
1858
1859 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1860 }
1861
1862 // So long as they are correct, wide global memory operations perform better
1863 // than multiple smaller memory ops -- even when misaligned
1864 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1865 if (IsFast)
1866 *IsFast = Size;
1867
1868 return Alignment >= Align(4) ||
1870 }
1871
 1872 // Values smaller than a dword must be aligned.
1873 if (Size < 32)
1874 return false;
1875
1876 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1877 // byte-address are ignored, thus forcing Dword alignment.
1878 // This applies to private, global, and constant memory.
1879 if (IsFast)
1880 *IsFast = 1;
1881
1882 return Size >= 32 && Alignment >= Align(4);
1883}
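// Worked example of the "speed rank" reporting above (illustrative): a 64-bit
// LDS access on a subtarget with unaligned DS access enabled reports
// *IsFast = 64 at Align(4) or better (ds_read_b64 or an adjacent-offset
// ds_read2_b32 both serve), but only 32 below Align(4), so a wide access is
// still preferred while being ranked no faster than a single dword access.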
1884
1886 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1887 unsigned *IsFast) const {
1889 Alignment, Flags, IsFast);
1890}
1891
1893 const MemOp &Op, const AttributeList &FuncAttributes) const {
1894 // FIXME: Should account for address space here.
1895
1896 // The default fallback uses the private pointer size as a guess for a type to
1897 // use. Make sure we switch these to 64-bit accesses.
1898
1899 if (Op.size() >= 16 &&
1900 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1901 return MVT::v4i32;
1902
1903 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1904 return MVT::v2i32;
1905
1906 // Use the default.
1907 return MVT::Other;
1908}
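// For illustration (assumed memcpy sizes, not from this file): a 64-byte copy
// whose destination is known 4-byte aligned is expanded with v4i32 (16-byte)
// chunks, an 8-byte copy uses v2i32, and anything smaller or less aligned
// falls back to the generic choice via MVT::Other.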
1909
1911 const MemSDNode *MemNode = cast<MemSDNode>(N);
1912 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1913}
1914
1916 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1918}
1919
1921 unsigned DestAS) const {
1922 // Flat -> private/local is a simple truncate.
1923 // Flat -> global is no-op
1924 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1925 return true;
1926
1927 const GCNTargetMachine &TM =
1928 static_cast<const GCNTargetMachine &>(getTargetMachine());
1929 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1930}
1931
1934 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1935 VT.getScalarType().bitsLE(MVT::i16))
1938}
1939
1941 Type *Ty) const {
1942 // FIXME: Could be smarter if called for vector constants.
1943 return true;
1944}
1945
1947 unsigned Index) const {
1949 return false;
1950
1951 // TODO: Add more cases that are cheap.
1952 return Index == 0;
1953}
1954
1955bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
 1956 // TODO: This should be more aggressive, particularly for 16-bit element
 1957 // vectors. However, there are some mixed improvements and regressions.
1958 EVT EltTy = VT.getVectorElementType();
1959 return EltTy.getSizeInBits() % 32 == 0;
1960}
1961
1963 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1964 switch (Op) {
1965 case ISD::LOAD:
1966 case ISD::STORE:
1967 return true;
1968 default:
1969 return false;
1970 }
1971 }
1972
1973 // SimplifySetCC uses this function to determine whether or not it should
1974 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1975 if (VT == MVT::i1 && Op == ISD::SETCC)
1976 return false;
1977
1979}
1980
1981SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1982 const SDLoc &SL,
1983 SDValue Chain,
1984 uint64_t Offset) const {
1985 const DataLayout &DL = DAG.getDataLayout();
1989
1990 auto [InputPtrReg, RC, ArgTy] =
1992
1993 // We may not have the kernarg segment argument if we have no kernel
1994 // arguments.
1995 if (!InputPtrReg)
1996 return DAG.getConstant(Offset, SL, PtrVT);
1997
1999 SDValue BasePtr = DAG.getCopyFromReg(
2000 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2001
2002 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2003}
2004
2005SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2006 const SDLoc &SL) const {
2009 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2010}
2011
2012SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2013 const SDLoc &SL) const {
2014
2016 std::optional<uint32_t> KnownSize =
2018 if (KnownSize.has_value())
2019 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2020 return SDValue();
2021}
2022
2023SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2024 const SDLoc &SL, SDValue Val,
2025 bool Signed,
2026 const ISD::InputArg *Arg) const {
2027 // First, if it is a widened vector, narrow it.
2028 if (VT.isVector() &&
2030 EVT NarrowedVT =
2033 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2034 DAG.getConstant(0, SL, MVT::i32));
2035 }
2036
2037 // Then convert the vector elements or scalar value.
2038 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2039 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2040 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2041 }
2042
2043 if (MemVT.isFloatingPoint())
2044 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2045 else if (Signed)
2046 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2047 else
2048 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2049
2050 return Val;
2051}
2052
2053SDValue SITargetLowering::lowerKernargMemParameter(
2054 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2055 uint64_t Offset, Align Alignment, bool Signed,
2056 const ISD::InputArg *Arg) const {
2058
2059 // Try to avoid using an extload by loading earlier than the argument address,
2060 // and extracting the relevant bits. The load should hopefully be merged with
2061 // the previous argument.
2062 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2063 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2064 int64_t AlignDownOffset = alignDown(Offset, 4);
2065 int64_t OffsetDiff = Offset - AlignDownOffset;
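// For example, an i16 argument at Offset == 2 loads the aligned dword at
// offset 0 and shifts it right by OffsetDiff * 8 == 16 bits before
// truncating to the argument type.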
2066
2067 EVT IntVT = MemVT.changeTypeToInteger();
2068
2069 // TODO: If we passed in the base kernel offset we could have a better
2070 // alignment than 4, but we don't really need it.
2071 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2072 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2075
2076 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2077 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2078
2079 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2080 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2081 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2082
2083 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2084 }
2085
2086 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2087 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2090
2091 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2092 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2093}
2094
2095SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2096 CCValAssign &VA, const SDLoc &SL,
2097 SDValue Chain,
2098 const ISD::InputArg &Arg) const {
2100 MachineFrameInfo &MFI = MF.getFrameInfo();
2101
2102 if (Arg.Flags.isByVal()) {
2103 unsigned Size = Arg.Flags.getByValSize();
2104 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2105 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2106 }
2107
2108 unsigned ArgOffset = VA.getLocMemOffset();
2109 unsigned ArgSize = VA.getValVT().getStoreSize();
2110
2111 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2112
2113 // Create load nodes to retrieve arguments from the stack.
2114 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2115 SDValue ArgValue;
2116
2117 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2119 MVT MemVT = VA.getValVT();
2120
2121 switch (VA.getLocInfo()) {
2122 default:
2123 break;
2124 case CCValAssign::BCvt:
2125 MemVT = VA.getLocVT();
2126 break;
2127 case CCValAssign::SExt:
2128 ExtType = ISD::SEXTLOAD;
2129 break;
2130 case CCValAssign::ZExt:
2131 ExtType = ISD::ZEXTLOAD;
2132 break;
2133 case CCValAssign::AExt:
2134 ExtType = ISD::EXTLOAD;
2135 break;
2136 }
2137
2138 ArgValue = DAG.getExtLoad(
2139 ExtType, SL, VA.getLocVT(), Chain, FIN,
2141 return ArgValue;
2142}
2143
2144SDValue SITargetLowering::getPreloadedValue(
2145 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2147 const ArgDescriptor *Reg = nullptr;
2148 const TargetRegisterClass *RC;
2149 LLT Ty;
2150
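// With architected SGPRs the workgroup IDs live in trap-temporary registers:
// ttmp9 holds the X ID, while ttmp7 packs the Y ID in its low 16 bits and
// the Z ID in its high 16 bits, which is what the masks below select.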
2152 const ArgDescriptor WorkGroupIDX =
2153 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2154 // If GridZ is not programmed in an entry function then the hardware will set
2155 // it to all zeros, so there is no need to mask the GridY value in the low
2156 // order bits.
2157 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2158 AMDGPU::TTMP7,
2159 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2160 const ArgDescriptor WorkGroupIDZ =
2161 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2162 if (Subtarget->hasArchitectedSGPRs() &&
2164 switch (PVID) {
2166 Reg = &WorkGroupIDX;
2167 RC = &AMDGPU::SReg_32RegClass;
2168 Ty = LLT::scalar(32);
2169 break;
2171 Reg = &WorkGroupIDY;
2172 RC = &AMDGPU::SReg_32RegClass;
2173 Ty = LLT::scalar(32);
2174 break;
2176 Reg = &WorkGroupIDZ;
2177 RC = &AMDGPU::SReg_32RegClass;
2178 Ty = LLT::scalar(32);
2179 break;
2180 default:
2181 break;
2182 }
2183 }
2184
2185 if (!Reg)
2186 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2187 if (!Reg) {
2189 // It's possible for a kernarg intrinsic call to appear in a kernel with
2190 // no allocated segment, in which case we do not add the user sgpr
2191 // argument, so just return null.
2192 return DAG.getConstant(0, SDLoc(), VT);
2193 }
2194
2195 // It's undefined behavior if a function marked with the amdgpu-no-*
2196 // attributes uses the corresponding intrinsic.
2197 return DAG.getUNDEF(VT);
2198 }
2199
2200 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2201}
2202
2204 CallingConv::ID CallConv,
2205 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2206 FunctionType *FType,
2207 SIMachineFunctionInfo *Info) {
2208 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2209 const ISD::InputArg *Arg = &Ins[I];
2210
2211 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2212 "vector type argument should have been split");
2213
2214 // First check if it's a PS input addr.
2215 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2216 PSInputNum <= 15) {
2217 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2218
2219 // Inconveniently only the first part of the split is marked as isSplit,
2220 // so skip to the end. We only want to increment PSInputNum once for the
2221 // entire split argument.
2222 if (Arg->Flags.isSplit()) {
2223 while (!Arg->Flags.isSplitEnd()) {
2224 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2225 "unexpected vector split in ps argument type");
2226 if (!SkipArg)
2227 Splits.push_back(*Arg);
2228 Arg = &Ins[++I];
2229 }
2230 }
2231
2232 if (SkipArg) {
2233 // We can safely skip PS inputs.
2234 Skipped.set(Arg->getOrigArgIndex());
2235 ++PSInputNum;
2236 continue;
2237 }
2238
2239 Info->markPSInputAllocated(PSInputNum);
2240 if (Arg->Used)
2241 Info->markPSInputEnabled(PSInputNum);
2242
2243 ++PSInputNum;
2244 }
2245
2246 Splits.push_back(*Arg);
2247 }
2248}
2249
2250// Allocate special inputs passed in VGPRs.
2252 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2253 SIMachineFunctionInfo &Info) const {
2254 const LLT S32 = LLT::scalar(32);
2256
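// With packed TIDs all three workitem IDs are preloaded into VGPR0: X in
// bits [9:0], Y in bits [19:10] and Z in bits [29:20], so only a 10-bit
// field (0x3ff) of the register is claimed for each ID below.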
2257 if (Info.hasWorkItemIDX()) {
2258 Register Reg = AMDGPU::VGPR0;
2259 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2260
2261 CCInfo.AllocateReg(Reg);
2262 unsigned Mask =
2263 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2264 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2265 }
2266
2267 if (Info.hasWorkItemIDY()) {
2268 assert(Info.hasWorkItemIDX());
2269 if (Subtarget->hasPackedTID()) {
2270 Info.setWorkItemIDY(
2271 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2272 } else {
2273 unsigned Reg = AMDGPU::VGPR1;
2274 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2275
2276 CCInfo.AllocateReg(Reg);
2277 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2278 }
2279 }
2280
2281 if (Info.hasWorkItemIDZ()) {
2282 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2283 if (Subtarget->hasPackedTID()) {
2284 Info.setWorkItemIDZ(
2285 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2286 } else {
2287 unsigned Reg = AMDGPU::VGPR2;
2288 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2289
2290 CCInfo.AllocateReg(Reg);
2291 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2292 }
2293 }
2294}
2295
2296 // Try to allocate a VGPR at the end of the argument list, or if no argument
2297 // VGPRs are left, allocate a stack slot.
2298 // If \p Mask is given, it indicates the bitfield position in the register.
2299 // If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2300static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2301 ArgDescriptor Arg = ArgDescriptor()) {
2302 if (Arg.isSet())
2303 return ArgDescriptor::createArg(Arg, Mask);
2304
2305 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2306 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2307 if (RegIdx == ArgVGPRs.size()) {
2308 // Spill to stack required.
2309 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2310
2311 return ArgDescriptor::createStack(Offset, Mask);
2312 }
2313
2314 unsigned Reg = ArgVGPRs[RegIdx];
2315 Reg = CCInfo.AllocateReg(Reg);
2316 assert(Reg != AMDGPU::NoRegister);
2317
2318 MachineFunction &MF = CCInfo.getMachineFunction();
2319 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2320 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2321 return ArgDescriptor::createRegister(Reg, Mask);
2322}
2323
2325 const TargetRegisterClass *RC,
2326 unsigned NumArgRegs) {
2327 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2328 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2329 if (RegIdx == ArgSGPRs.size())
2330 report_fatal_error("ran out of SGPRs for arguments");
2331
2332 unsigned Reg = ArgSGPRs[RegIdx];
2333 Reg = CCInfo.AllocateReg(Reg);
2334 assert(Reg != AMDGPU::NoRegister);
2335
2336 MachineFunction &MF = CCInfo.getMachineFunction();
2337 MF.addLiveIn(Reg, RC);
2339}
2340
2341// If this has a fixed position, we still should allocate the register in the
2342// CCInfo state. Technically we could get away with this for values passed
2343// outside of the normal argument range.
2345 const TargetRegisterClass *RC,
2346 MCRegister Reg) {
2347 Reg = CCInfo.AllocateReg(Reg);
2348 assert(Reg != AMDGPU::NoRegister);
2349 MachineFunction &MF = CCInfo.getMachineFunction();
2350 MF.addLiveIn(Reg, RC);
2351}
2352
2353static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2354 if (Arg) {
2355 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2356 Arg.getRegister());
2357 } else
2358 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2359}
2360
2361static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2362 if (Arg) {
2363 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2364 Arg.getRegister());
2365 } else
2366 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2367}
2368
2369/// Allocate implicit function VGPR arguments at the end of allocated user
2370/// arguments.
2372 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2373 SIMachineFunctionInfo &Info) const {
2374 const unsigned Mask = 0x3ff;
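// The workitem IDs are packed into a single 32-bit VGPR: X in bits [9:0],
// Y in bits [19:10] and Z in bits [29:20], mirroring the packed-TID layout
// used for kernel entry points.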
2375 ArgDescriptor Arg;
2376
2377 if (Info.hasWorkItemIDX()) {
2378 Arg = allocateVGPR32Input(CCInfo, Mask);
2379 Info.setWorkItemIDX(Arg);
2380 }
2381
2382 if (Info.hasWorkItemIDY()) {
2383 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2384 Info.setWorkItemIDY(Arg);
2385 }
2386
2387 if (Info.hasWorkItemIDZ())
2388 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2389}
2390
2391/// Allocate implicit function VGPR arguments in fixed registers.
2393 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2394 SIMachineFunctionInfo &Info) const {
2395 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2396 if (!Reg)
2397 report_fatal_error("failed to allocated VGPR for implicit arguments");
2398
2399 const unsigned Mask = 0x3ff;
2400 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2401 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2402 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2403}
2404
2406 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2407 SIMachineFunctionInfo &Info) const {
2408 auto &ArgInfo = Info.getArgInfo();
2409 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2410
2411 // TODO: Unify handling with private memory pointers.
2412 if (UserSGPRInfo.hasDispatchPtr())
2413 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2414
2415 if (UserSGPRInfo.hasQueuePtr())
2416 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2417
2418 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2419 // constant offset from the kernarg segment.
2420 if (Info.hasImplicitArgPtr())
2421 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2422
2423 if (UserSGPRInfo.hasDispatchID())
2424 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2425
2426 // flat_scratch_init is not applicable for non-kernel functions.
2427
2428 if (Info.hasWorkGroupIDX())
2429 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2430
2431 if (Info.hasWorkGroupIDY())
2432 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2433
2434 if (Info.hasWorkGroupIDZ())
2435 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2436
2437 if (Info.hasLDSKernelId())
2438 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2439}
2440
2441// Allocate special inputs passed in user SGPRs.
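// Each Info.add* call below claims the next consecutive user SGPR(s), so the
// checks are ordered to match the user SGPR layout defined by the kernel ABI.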
2443 MachineFunction &MF,
2444 const SIRegisterInfo &TRI,
2445 SIMachineFunctionInfo &Info) const {
2446 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2447 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2448 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2449 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2450 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2451 }
2452
2453 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2454 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2455 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2456 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2457 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2458 }
2459
2460 if (UserSGPRInfo.hasDispatchPtr()) {
2461 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2462 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2463 CCInfo.AllocateReg(DispatchPtrReg);
2464 }
2465
2466 if (UserSGPRInfo.hasQueuePtr()) {
2467 Register QueuePtrReg = Info.addQueuePtr(TRI);
2468 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2469 CCInfo.AllocateReg(QueuePtrReg);
2470 }
2471
2472 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2474 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2475 CCInfo.AllocateReg(InputPtrReg);
2476
2477 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2478 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2479 }
2480
2481 if (UserSGPRInfo.hasDispatchID()) {
2482 Register DispatchIDReg = Info.addDispatchID(TRI);
2483 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2484 CCInfo.AllocateReg(DispatchIDReg);
2485 }
2486
2487 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2488 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2489 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2490 CCInfo.AllocateReg(FlatScratchInitReg);
2491 }
2492
2493 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2494 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2495 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2496 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2497 }
2498
2499 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2500 // these from the dispatch pointer.
2501}
2502
2503 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2504 // sequential, starting from the first argument.
2506 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2508 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2509 Function &F = MF.getFunction();
2510 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2511 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2512 bool InPreloadSequence = true;
2513 unsigned InIdx = 0;
2514 bool AlignedForImplicitArgs = false;
2515 unsigned ImplicitArgOffset = 0;
2516 for (auto &Arg : F.args()) {
2517 if (!InPreloadSequence || !Arg.hasInRegAttr())
2518 break;
2519
2520 unsigned ArgIdx = Arg.getArgNo();
2521 // Don't preload non-original args or parts not in the current preload
2522 // sequence.
2523 if (InIdx < Ins.size() &&
2524 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2525 break;
2526
2527 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2528 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2529 InIdx++) {
2530 assert(ArgLocs[ArgIdx].isMemLoc());
2531 auto &ArgLoc = ArgLocs[InIdx];
2532 const Align KernelArgBaseAlign = Align(16);
2533 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2534 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2535 unsigned NumAllocSGPRs =
2536 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2537
2538 // Fix alignment for hidden arguments.
2539 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2540 if (!AlignedForImplicitArgs) {
2541 ImplicitArgOffset =
2542 alignTo(LastExplicitArgOffset,
2543 Subtarget->getAlignmentForImplicitArgPtr()) -
2544 LastExplicitArgOffset;
2545 AlignedForImplicitArgs = true;
2546 }
2547 ArgOffset += ImplicitArgOffset;
2548 }
2549
2550 // Arg is preloaded into the previous SGPR.
2551 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2552 assert(InIdx >= 1 && "No previous SGPR");
2553 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2554 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2555 continue;
2556 }
2557
2558 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2559 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
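// Kernarg preloading maps each 4-byte slot of the kernarg segment onto one
// user SGPR, so alignment padding between arguments also consumes SGPRs.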
2560 // Check for free user SGPRs for preloading.
2561 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2562 InPreloadSequence = false;
2563 break;
2564 }
2565
2566 // Preload this argument.
2567 const TargetRegisterClass *RC =
2568 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2569 SmallVectorImpl<MCRegister> *PreloadRegs =
2570 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2571
2572 if (PreloadRegs->size() > 1)
2573 RC = &AMDGPU::SGPR_32RegClass;
2574 for (auto &Reg : *PreloadRegs) {
2575 assert(Reg);
2576 MF.addLiveIn(Reg, RC);
2577 CCInfo.AllocateReg(Reg);
2578 }
2579
2580 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2581 }
2582 }
2583}
2584
2586 const SIRegisterInfo &TRI,
2587 SIMachineFunctionInfo &Info) const {
2588 // Always allocate this last since it is a synthetic preload.
2589 if (Info.hasLDSKernelId()) {
2590 Register Reg = Info.addLDSKernelId();
2591 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2592 CCInfo.AllocateReg(Reg);
2593 }
2594}
2595
2596// Allocate special input registers that are initialized per-wave.
2599 CallingConv::ID CallConv,
2600 bool IsShader) const {
2601 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2602 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2603 // Note: user SGPRs are handled by the front-end for graphics shaders
2604 // Pad up the used user SGPRs with dead inputs.
2605
2606 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2607 // before enabling architected SGPRs for workgroup IDs.
2608 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2609
2610 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2611 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2612 // rely on it to reach 16, since if there ends up being no stack usage it
2613 // will not actually be added.
2614 unsigned NumRequiredSystemSGPRs =
2615 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2616 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2617 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2618 Register Reg = Info.addReservedUserSGPR();
2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622 }
2623
2624 if (!HasArchitectedSGPRs) {
2625 if (Info.hasWorkGroupIDX()) {
2626 Register Reg = Info.addWorkGroupIDX();
2627 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2628 CCInfo.AllocateReg(Reg);
2629 }
2630
2631 if (Info.hasWorkGroupIDY()) {
2632 Register Reg = Info.addWorkGroupIDY();
2633 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2634 CCInfo.AllocateReg(Reg);
2635 }
2636
2637 if (Info.hasWorkGroupIDZ()) {
2638 Register Reg = Info.addWorkGroupIDZ();
2639 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2640 CCInfo.AllocateReg(Reg);
2641 }
2642 }
2643
2644 if (Info.hasWorkGroupInfo()) {
2645 Register Reg = Info.addWorkGroupInfo();
2646 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2647 CCInfo.AllocateReg(Reg);
2648 }
2649
2650 if (Info.hasPrivateSegmentWaveByteOffset()) {
2651 // Scratch wave offset passed in system SGPR.
2652 unsigned PrivateSegmentWaveByteOffsetReg;
2653
2654 if (IsShader) {
2655 PrivateSegmentWaveByteOffsetReg =
2656 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2657
2658 // This is true if the scratch wave byte offset doesn't have a fixed
2659 // location.
2660 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2661 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2662 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2663 }
2664 } else
2665 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2666
2667 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2668 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2669 }
2670
2671 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2672 Info.getNumPreloadedSGPRs() >= 16);
2673}
2674
2676 MachineFunction &MF,
2677 const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) {
2679 // Now that we've figured out where the scratch register inputs are, see if
2680 // we should reserve the arguments and use them directly.
2681 MachineFrameInfo &MFI = MF.getFrameInfo();
2682 bool HasStackObjects = MFI.hasStackObjects();
2683 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2684
2685 // Record that we know we have non-spill stack objects so we don't need to
2686 // check all stack objects later.
2687 if (HasStackObjects)
2688 Info.setHasNonSpillStackObjects(true);
2689
2690 // Everything live out of a block is spilled with fast regalloc, so it's
2691 // almost certain that spilling will be required.
2692 if (TM.getOptLevel() == CodeGenOptLevel::None)
2693 HasStackObjects = true;
2694
2695 // For now assume stack access is needed in any callee functions, so we need
2696 // the scratch registers to pass in.
2697 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2698
2699 if (!ST.enableFlatScratch()) {
2700 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2701 // If we have stack objects, we unquestionably need the private buffer
2702 // resource. For the Code Object V2 ABI, this will be the first 4 user
2703 // SGPR inputs. We can reserve those and use them directly.
2704
2705 Register PrivateSegmentBufferReg =
2707 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2708 } else {
2709 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2710 // We tentatively reserve the last available registers (skipping the very last
2711 // ones, which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2712 // we'll replace these with the ones immediately after those which were
2713 // really allocated. In the prologue copies will be inserted from the
2714 // argument to these reserved registers.
2715
2716 // Without HSA, relocations are used for the scratch pointer and the
2717 // buffer resource setup is always inserted in the prologue. Scratch wave
2718 // offset is still in an input SGPR.
2719 Info.setScratchRSrcReg(ReservedBufferReg);
2720 }
2721 }
2722
2724
2725 // For entry functions we have to set up the stack pointer if we use it,
2726 // whereas non-entry functions get this "for free". This means there is no
2727 // intrinsic advantage to using S32 over S34 in cases where we do not have
2728 // calls but do need a frame pointer (i.e. if we are requested to have one
2729 // because frame pointer elimination is disabled). To keep things simple we
2730 // only ever use S32 as the call ABI stack pointer, and so using it does not
2731 // imply we need a separate frame pointer.
2732 //
2733 // Try to use s32 as the SP, but move it if it would interfere with input
2734 // arguments. This won't work with calls though.
2735 //
2736 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2737 // registers.
2738 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2739 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2740 } else {
2742
2743 if (MFI.hasCalls())
2744 report_fatal_error("call in graphics shader with too many input SGPRs");
2745
2746 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2747 if (!MRI.isLiveIn(Reg)) {
2748 Info.setStackPtrOffsetReg(Reg);
2749 break;
2750 }
2751 }
2752
2753 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2754 report_fatal_error("failed to find register for SP");
2755 }
2756
2757 // hasFP should be accurate for entry functions even before the frame is
2758 // finalized, because it does not rely on the known stack size, only
2759 // properties like whether variable sized objects are present.
2760 if (ST.getFrameLowering()->hasFP(MF)) {
2761 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2762 }
2763}
2764
2767 return !Info->isEntryFunction();
2768}
2769
2771
2773 MachineBasicBlock *Entry,
2774 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2776
2777 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2778 if (!IStart)
2779 return;
2780
2781 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2782 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2783 MachineBasicBlock::iterator MBBI = Entry->begin();
2784 for (const MCPhysReg *I = IStart; *I; ++I) {
2785 const TargetRegisterClass *RC = nullptr;
2786 if (AMDGPU::SReg_64RegClass.contains(*I))
2787 RC = &AMDGPU::SGPR_64RegClass;
2788 else if (AMDGPU::SReg_32RegClass.contains(*I))
2789 RC = &AMDGPU::SGPR_32RegClass;
2790 else
2791 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2792
2793 Register NewVR = MRI->createVirtualRegister(RC);
2794 // Create copy from CSR to a virtual register.
2795 Entry->addLiveIn(*I);
2796 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2797 .addReg(*I);
2798
2799 // Insert the copy-back instructions right before the terminator.
2800 for (auto *Exit : Exits)
2801 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2802 TII->get(TargetOpcode::COPY), *I)
2803 .addReg(NewVR);
2804 }
2805}
2806
2808 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2809 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2810 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2812
2814 const Function &Fn = MF.getFunction();
2817
2818 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2819 DiagnosticInfoUnsupported NoGraphicsHSA(
2820 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2821 DAG.getContext()->diagnose(NoGraphicsHSA);
2822 return DAG.getEntryNode();
2823 }
2824
2827 BitVector Skipped(Ins.size());
2828 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2829 *DAG.getContext());
2830
2831 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2832 bool IsKernel = AMDGPU::isKernel(CallConv);
2833 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2834
2835 if (IsGraphics) {
2836 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2837 assert(!UserSGPRInfo.hasDispatchPtr() &&
2838 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2839 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2840 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2841 (void)UserSGPRInfo;
2842 if (!Subtarget->enableFlatScratch())
2843 assert(!UserSGPRInfo.hasFlatScratchInit());
2844 if ((CallConv != CallingConv::AMDGPU_CS &&
2845 CallConv != CallingConv::AMDGPU_Gfx) ||
2846 !Subtarget->hasArchitectedSGPRs())
2847 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2848 !Info->hasWorkGroupIDZ());
2849 }
2850
2851 if (CallConv == CallingConv::AMDGPU_PS) {
2852 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2853
2854 // At least one interpolation mode must be enabled or else the GPU will
2855 // hang.
2856 //
2857 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2858 // set PSInputAddr, the user wants to enable some bits after the compilation
2859 // based on run-time states. Since we can't know what the final PSInputEna
2860 // will look like, we shouldn't do anything here and the user should take
2861 // responsibility for the correct programming.
2862 //
2863 // Otherwise, the following restrictions apply:
2864 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2865 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2866 // enabled too.
2867 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2868 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2869 CCInfo.AllocateReg(AMDGPU::VGPR0);
2870 CCInfo.AllocateReg(AMDGPU::VGPR1);
2871 Info->markPSInputAllocated(0);
2872 Info->markPSInputEnabled(0);
2873 }
2874 if (Subtarget->isAmdPalOS()) {
2875 // For isAmdPalOS, the user does not enable some bits after compilation
2876 // based on run-time states; the register values being generated here are
2877 // the final ones set in hardware. Therefore we need to apply the
2878 // workaround to PSInputAddr and PSInputEnable together. (The case where
2879 // a bit is set in PSInputAddr but not PSInputEnable is where the
2880 // frontend set up an input arg for a particular interpolation mode, but
2881 // nothing uses that input arg. Really we should have an earlier pass
2882 // that removes such an arg.)
2883 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2884 if ((PsInputBits & 0x7F) == 0 ||
2885 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2886 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2887 }
2888 } else if (IsKernel) {
2889 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2890 } else {
2891 Splits.append(Ins.begin(), Ins.end());
2892 }
2893
2894 if (IsKernel)
2895 analyzeFormalArgumentsCompute(CCInfo, Ins);
2896
2897 if (IsEntryFunc) {
2898 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2899 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2900 if (IsKernel && Subtarget->hasKernargPreload())
2901 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2902
2903 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2904 } else if (!IsGraphics) {
2905 // For the fixed ABI, pass workitem IDs in the last argument register.
2906 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2907
2908 // FIXME: Sink this into allocateSpecialInputSGPRs
2909 if (!Subtarget->enableFlatScratch())
2910 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2911
2912 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2913 }
2914
2915 if (!IsKernel) {
2916 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2917 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2918 }
2919
2921
2922 // FIXME: This is the minimum kernel argument alignment. We should improve
2923 // this to the maximum alignment of the arguments.
2924 //
2925 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2926 // kern arg offset.
2927 const Align KernelArgBaseAlign = Align(16);
2928
2929 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2930 const ISD::InputArg &Arg = Ins[i];
2931 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2932 InVals.push_back(DAG.getUNDEF(Arg.VT));
2933 continue;
2934 }
2935
2936 CCValAssign &VA = ArgLocs[ArgIdx++];
2937 MVT VT = VA.getLocVT();
2938
2939 if (IsEntryFunc && VA.isMemLoc()) {
2940 VT = Ins[i].VT;
2941 EVT MemVT = VA.getLocVT();
2942
2943 const uint64_t Offset = VA.getLocMemOffset();
2944 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2945
2946 if (Arg.Flags.isByRef()) {
2947 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2948
2949 const GCNTargetMachine &TM =
2950 static_cast<const GCNTargetMachine &>(getTargetMachine());
2951 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2952 Arg.Flags.getPointerAddrSpace())) {
2955 }
2956
2957 InVals.push_back(Ptr);
2958 continue;
2959 }
2960
2961 SDValue NewArg;
2962 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2963 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2964 // In this case the argument is packed into the previous preload SGPR.
2965 int64_t AlignDownOffset = alignDown(Offset, 4);
2966 int64_t OffsetDiff = Offset - AlignDownOffset;
2967 EVT IntVT = MemVT.changeTypeToInteger();
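// As in lowerKernargMemParameter, the sub-dword value is recovered by
// shifting the containing 32-bit register right by OffsetDiff * 8 bits and
// truncating.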
2968
2972 Register Reg =
2973 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2974
2975 assert(Reg);
2976 Register VReg = MRI.getLiveInVirtReg(Reg);
2977 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2978
2979 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2980 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2981
2982 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2983 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2984 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2985 Ins[i].Flags.isSExt(), &Ins[i]);
2986
2987 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2988 } else {
2992 const SmallVectorImpl<MCRegister> &PreloadRegs =
2993 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2994
2995 SDValue Copy;
2996 if (PreloadRegs.size() == 1) {
2997 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2998 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2999 NewArg = DAG.getCopyFromReg(
3000 Chain, DL, VReg,
3002 TRI->getRegSizeInBits(*RC)));
3003
3004 } else {
3005 // If the kernarg alignment does not match the alignment of the SGPR
3006 // tuple RC that can accommodate this argument, it will be built up
3007 // via copies from the individual SGPRs that the argument was
3008 // preloaded to.
3010 for (auto Reg : PreloadRegs) {
3011 Register VReg = MRI.getLiveInVirtReg(Reg);
3012 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3013 Elts.push_back(Copy);
3014 }
3015 NewArg =
3016 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3017 PreloadRegs.size()),
3018 DL, Elts);
3019 }
3020
3021 // If the argument was preloaded to multiple consecutive 32-bit
3022 // registers because of misalignment between addressable SGPR tuples
3023 // and the argument size, we can still assume that, because of kernarg
3024 // segment alignment restrictions, NewArg's size is the same as
3025 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3026 // truncate since we cannot preload to less than a single SGPR and the
3027 // MemVT may be smaller.
3028 EVT MemVTInt =
3030 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3031 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3032
3033 NewArg = DAG.getBitcast(MemVT, NewArg);
3034 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3035 Ins[i].Flags.isSExt(), &Ins[i]);
3036 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3037 }
3038 } else {
3039 // Hidden arguments that are in the kernel signature must be preloaded
3040 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3041 // the argument list and is not preloaded.
3042 if (Arg.isOrigArg()) {
3043 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3044 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3045 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3046 *OrigArg->getParent(),
3047 "hidden argument in kernel signature was not preloaded",
3048 DL.getDebugLoc());
3049 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3050 }
3051 }
3052
3053 NewArg =
3054 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3055 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3056 }
3057 Chains.push_back(NewArg.getValue(1));
3058
3059 auto *ParamTy =
3060 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3062 ParamTy &&
3063 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3064 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3065 // On SI, local pointers are just offsets into LDS, so they always
3066 // fit in 16 bits. On CI and newer they could potentially be
3067 // real pointers, so we can't guarantee their size.
3068 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3069 DAG.getValueType(MVT::i16));
3070 }
3071
3072 InVals.push_back(NewArg);
3073 continue;
3074 }
3075 if (!IsEntryFunc && VA.isMemLoc()) {
3076 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3077 InVals.push_back(Val);
3078 if (!Arg.Flags.isByVal())
3079 Chains.push_back(Val.getValue(1));
3080 continue;
3081 }
3082
3083 assert(VA.isRegLoc() && "Parameter must be in a register!");
3084
3085 Register Reg = VA.getLocReg();
3086 const TargetRegisterClass *RC = nullptr;
3087 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3088 RC = &AMDGPU::VGPR_32RegClass;
3089 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3090 RC = &AMDGPU::SGPR_32RegClass;
3091 else
3092 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3093 EVT ValVT = VA.getValVT();
3094
3095 Reg = MF.addLiveIn(Reg, RC);
3096 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3097
3098 if (Arg.Flags.isSRet()) {
3099 // The return object should be reasonably addressable.
3100
3101 // FIXME: This helps when the return is a real sret. If it is an
3102 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3103 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3104 unsigned NumBits =
3106 Val = DAG.getNode(
3107 ISD::AssertZext, DL, VT, Val,
3108 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3109 }
3110
3111 // If this is an 8 or 16-bit value, it is really passed promoted
3112 // to 32 bits. Insert an assert[sz]ext to capture this, then
3113 // truncate to the right size.
3114 switch (VA.getLocInfo()) {
3115 case CCValAssign::Full:
3116 break;
3117 case CCValAssign::BCvt:
3118 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3119 break;
3120 case CCValAssign::SExt:
3121 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3122 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3123 break;
3124 case CCValAssign::ZExt:
3125 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3126 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3127 break;
3128 case CCValAssign::AExt:
3129 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3130 break;
3131 default:
3132 llvm_unreachable("Unknown loc info!");
3133 }
3134
3135 InVals.push_back(Val);
3136 }
3137
3138 // Start adding system SGPRs.
3139 if (IsEntryFunc)
3140 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3141
3142 // DAG.getPass() returns nullptr when using new pass manager.
3143 // TODO: Use DAG.getMFAM() to access analysis result.
3144 if (DAG.getPass()) {
3145 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3146 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3147 }
3148
3149 unsigned StackArgSize = CCInfo.getStackSize();
3150 Info->setBytesInStackArgArea(StackArgSize);
3151
3152 return Chains.empty() ? Chain
3153 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3154}
3155
3156// TODO: If return values can't fit in registers, we should return as many as
3157// possible in registers before passing on stack.
3159 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3160 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3161 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3162 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3163 // for shaders. Vector types should be explicitly handled by CC.
3164 if (AMDGPU::isEntryFunctionCC(CallConv))
3165 return true;
3166
3168 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3169 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3170 return false;
3171
3172 // We must use the stack if return would require unavailable registers.
3173 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3174 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3175 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3176 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3177 return false;
3178
3179 return true;
3180}
3181
3182SDValue
3184 bool isVarArg,
3186 const SmallVectorImpl<SDValue> &OutVals,
3187 const SDLoc &DL, SelectionDAG &DAG) const {
3190
3191 if (AMDGPU::isKernel(CallConv)) {
3192 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3193 OutVals, DL, DAG);
3194 }
3195
3196 bool IsShader = AMDGPU::isShader(CallConv);
3197
3198 Info->setIfReturnsVoid(Outs.empty());
3199 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3200
3201 // CCValAssign - represent the assignment of the return value to a location.
3204
3205 // CCState - Info about the registers and stack slots.
3206 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3207 *DAG.getContext());
3208
3209 // Analyze outgoing return values.
3210 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3211
3212 SDValue Glue;
3214 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3215
3216 // Copy the result values into the output registers.
3217 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3218 ++I, ++RealRVLocIdx) {
3219 CCValAssign &VA = RVLocs[I];
3220 assert(VA.isRegLoc() && "Can only return in registers!");
3221 // TODO: Partially return in registers if return values don't fit.
3222 SDValue Arg = OutVals[RealRVLocIdx];
3223
3224 // Copied from other backends.
3225 switch (VA.getLocInfo()) {
3226 case CCValAssign::Full:
3227 break;
3228 case CCValAssign::BCvt:
3229 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3230 break;
3231 case CCValAssign::SExt:
3232 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3233 break;
3234 case CCValAssign::ZExt:
3235 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3236 break;
3237 case CCValAssign::AExt:
3238 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3239 break;
3240 default:
3241 llvm_unreachable("Unknown loc info!");
3242 }
3243
3244 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3245 Glue = Chain.getValue(1);
3246 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3247 }
3248
3249 // FIXME: Does sret work properly?
3250 if (!Info->isEntryFunction()) {
3251 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3252 const MCPhysReg *I =
3253 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3254 if (I) {
3255 for (; *I; ++I) {
3256 if (AMDGPU::SReg_64RegClass.contains(*I))
3257 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3258 else if (AMDGPU::SReg_32RegClass.contains(*I))
3259 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3260 else
3261 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3262 }
3263 }
3264 }
3265
3266 // Update chain and glue.
3267 RetOps[0] = Chain;
3268 if (Glue.getNode())
3269 RetOps.push_back(Glue);
3270
3271 unsigned Opc = AMDGPUISD::ENDPGM;
3272 if (!IsWaveEnd)
3274 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3275}
3276
3278 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3279 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3280 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3281 SDValue ThisVal) const {
3282 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3283
3284 // Assign locations to each value returned by this call.
3286 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3287 *DAG.getContext());
3288 CCInfo.AnalyzeCallResult(Ins, RetCC);
3289
3290 // Copy all of the result registers out of their specified physreg.
3291 for (CCValAssign VA : RVLocs) {
3292 SDValue Val;
3293
3294 if (VA.isRegLoc()) {
3295 Val =
3296 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3297 Chain = Val.getValue(1);
3298 InGlue = Val.getValue(2);
3299 } else if (VA.isMemLoc()) {
3300 report_fatal_error("TODO: return values in memory");
3301 } else
3302 llvm_unreachable("unknown argument location type");
3303
3304 switch (VA.getLocInfo()) {
3305 case CCValAssign::Full:
3306 break;
3307 case CCValAssign::BCvt:
3308 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3309 break;
3310 case CCValAssign::ZExt:
3311 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3312 DAG.getValueType(VA.getValVT()));
3313 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3314 break;
3315 case CCValAssign::SExt:
3316 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3317 DAG.getValueType(VA.getValVT()));
3318 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3319 break;
3320 case CCValAssign::AExt:
3321 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3322 break;
3323 default:
3324 llvm_unreachable("Unknown loc info!");
3325 }
3326
3327 InVals.push_back(Val);
3328 }
3329
3330 return Chain;
3331}
3332
3333 // Add code to pass the special inputs required by the features in use,
3334 // separate from the explicit user arguments present in the IR.
3336 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3337 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3338 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3339 // If we don't have a call site, this was a call inserted by
3340 // legalization. These can never use special inputs.
3341 if (!CLI.CB)
3342 return;
3343
3344 SelectionDAG &DAG = CLI.DAG;
3345 const SDLoc &DL = CLI.DL;
3346 const Function &F = DAG.getMachineFunction().getFunction();
3347
3348 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3349 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3350
3351 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3353 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3354 // DAG.getPass() returns nullptr when using new pass manager.
3355 // TODO: Use DAG.getMFAM() to access analysis result.
3356 if (DAG.getPass()) {
3357 auto &ArgUsageInfo =
3359 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3360 }
3361 }
3362
3363 // TODO: Unify with private memory register handling. This is complicated by
3364 // the fact that at least in kernels, the input argument is not necessarily
3365 // in the same location as the input.
3366 // clang-format off
3367 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3369 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3370 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3371 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3372 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3373 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3374 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3375 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3376 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3377 };
3378 // clang-format on
3379
3380 for (auto [InputID, Attr] : ImplicitAttrs) {
3381 // If the callee does not use the attribute value, skip copying the value.
3382 if (CLI.CB->hasFnAttr(Attr))
3383 continue;
3384
3385 const auto [OutgoingArg, ArgRC, ArgTy] =
3386 CalleeArgInfo->getPreloadedValue(InputID);
3387 if (!OutgoingArg)
3388 continue;
3389
3390 const auto [IncomingArg, IncomingArgRC, Ty] =
3391 CallerArgInfo.getPreloadedValue(InputID);
3392 assert(IncomingArgRC == ArgRC);
3393
3394 // All special arguments are ints for now.
3395 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3396 SDValue InputReg;
3397
3398 if (IncomingArg) {
3399 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3400 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3401 // The implicit arg ptr is special because it doesn't have a corresponding
3402 // input for kernels, and is computed from the kernarg segment pointer.
3403 InputReg = getImplicitArgPtr(DAG, DL);
3404 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3405 std::optional<uint32_t> Id =
3407 if (Id.has_value()) {
3408 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3409 } else {
3410 InputReg = DAG.getUNDEF(ArgVT);
3411 }
3412 } else {
3413 // We may have proven the input wasn't needed, although the ABI still
3414 // requires it. We just need to allocate the register appropriately.
3415 InputReg = DAG.getUNDEF(ArgVT);
3416 }
3417
3418 if (OutgoingArg->isRegister()) {
3419 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3420 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3421 report_fatal_error("failed to allocate implicit input argument");
3422 } else {
3423 unsigned SpecialArgOffset =
3424 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3425 SDValue ArgStore =
3426 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3427 MemOpChains.push_back(ArgStore);
3428 }
3429 }
3430
3431 // Pack the workitem IDs into a single register, or pass them as-is if
3432 // already packed.
3433
3434 auto [OutgoingArg, ArgRC, Ty] =
3436 if (!OutgoingArg)
3437 std::tie(OutgoingArg, ArgRC, Ty) =
3439 if (!OutgoingArg)
3440 std::tie(OutgoingArg, ArgRC, Ty) =
3442 if (!OutgoingArg)
3443 return;
3444
3445 const ArgDescriptor *IncomingArgX = std::get<0>(
3447 const ArgDescriptor *IncomingArgY = std::get<0>(
3449 const ArgDescriptor *IncomingArgZ = std::get<0>(
3451
3452 SDValue InputReg;
3453 SDLoc SL;
3454
3455 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3456 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3457 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3458
3459 // If the incoming IDs are not packed, we need to pack them.
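// The outgoing packed layout matches the incoming convention: X in bits
// [9:0], Y in bits [19:10] and Z in bits [29:20], hence the shifts by 10 and
// 20 below.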
3460 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3461 NeedWorkItemIDX) {
3462 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3463 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3464 } else {
3465 InputReg = DAG.getConstant(0, DL, MVT::i32);
3466 }
3467 }
3468
3469 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3470 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3471 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3472 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3473 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3474 InputReg = InputReg.getNode()
3475 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3476 : Y;
3477 }
3478
3479 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3480 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3481 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3482 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3483 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3484 InputReg = InputReg.getNode()
3485 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3486 : Z;
3487 }
3488
3489 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3490 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3491 // We're in a situation where the outgoing function requires the workitem
3492 // ID, but the calling function does not have it (e.g. a graphics function
3493 // calling a C calling convention function). This is illegal, but we need
3494 // to produce something.
3495 InputReg = DAG.getUNDEF(MVT::i32);
3496 } else {
3497 // The workitem IDs are already packed, so any of the present incoming
3498 // arguments will carry all required fields.
3499 ArgDescriptor IncomingArg =
3500 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3501 : IncomingArgY ? *IncomingArgY
3502 : *IncomingArgZ,
3503 ~0u);
3504 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3505 }
3506 }
3507
3508 if (OutgoingArg->isRegister()) {
3509 if (InputReg)
3510 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3511
3512 CCInfo.AllocateReg(OutgoingArg->getRegister());
3513 } else {
3514 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3515 if (InputReg) {
3516 SDValue ArgStore =
3517 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3518 MemOpChains.push_back(ArgStore);
3519 }
3520 }
3521}
3522
3524 return CC == CallingConv::Fast;
3525}
3526
3527/// Return true if we might ever do TCO for calls with this calling convention.
3529 switch (CC) {
3530 case CallingConv::C:
3532 return true;
3533 default:
3534 return canGuaranteeTCO(CC);
3535 }
3536}
3537
3539 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3541 const SmallVectorImpl<SDValue> &OutVals,
3542 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3543 if (AMDGPU::isChainCC(CalleeCC))
3544 return true;
3545
3546 if (!mayTailCallThisCC(CalleeCC))
3547 return false;
3548
3549 // For a divergent call target, we need to do a waterfall loop over the
3550 // possible callees which precludes us from using a simple jump.
3551 if (Callee->isDivergent())
3552 return false;
3553
3555 const Function &CallerF = MF.getFunction();
3556 CallingConv::ID CallerCC = CallerF.getCallingConv();
3558 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3559
3560 // Kernels aren't callable, and don't have a live-in return address, so it
3561 // doesn't make sense to do a tail call with entry functions.
3562 if (!CallerPreserved)
3563 return false;
3564
3565 bool CCMatch = CallerCC == CalleeCC;
3566
3568 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3569 return true;
3570 return false;
3571 }
3572
3573 // TODO: Can we handle var args?
3574 if (IsVarArg)
3575 return false;
3576
3577 for (const Argument &Arg : CallerF.args()) {
3578 if (Arg.hasByValAttr())
3579 return false;
3580 }
3581
3582 LLVMContext &Ctx = *DAG.getContext();
3583
3584 // Check that the call results are passed in the same way.
3585 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3586 CCAssignFnForCall(CalleeCC, IsVarArg),
3587 CCAssignFnForCall(CallerCC, IsVarArg)))
3588 return false;
3589
3590 // The callee has to preserve all registers the caller needs to preserve.
3591 if (!CCMatch) {
3592 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3593 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3594 return false;
3595 }
3596
3597 // Nothing more to check if the callee is taking no arguments.
3598 if (Outs.empty())
3599 return true;
3600
3602 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3603
3604 // FIXME: We are not allocating special input registers, so we will be
3605 // deciding based on incorrect register assignments.
3606 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3607
3608 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3609 // If the stack arguments for this call do not fit into our own save area
3610 // then the call cannot be made a tail call.
3611 // TODO: Is this really necessary?
3612 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3613 return false;
3614
3615 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3616 // FIXME: What about inreg arguments that end up passed in memory?
3617 if (!CCVA.isRegLoc())
3618 continue;
3619
3620 // If we are passing an argument in an SGPR, and the value is divergent,
3621 // this call requires a waterfall loop.
3622 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3623 LLVM_DEBUG(
3624 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3625 << printReg(CCVA.getLocReg(), TRI) << '\n');
3626 return false;
3627 }
3628 }
3629
3630 const MachineRegisterInfo &MRI = MF.getRegInfo();
3631 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3632}
3633
3635 if (!CI->isTailCall())
3636 return false;
3637
3638 const Function *ParentFn = CI->getParent()->getParent();
3640 return false;
3641 return true;
3642}
3643
3644// The wave scratch offset register is used as the global base pointer.
3646 SmallVectorImpl<SDValue> &InVals) const {
3647 CallingConv::ID CallConv = CLI.CallConv;
3648 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3649
3650 SelectionDAG &DAG = CLI.DAG;
3651
3652 TargetLowering::ArgListEntry RequestedExec;
3653 if (IsChainCallConv) {
3654 // The last argument should be the value that we need to put in EXEC.
3655 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3656 // don't treat it like the rest of the arguments.
3657 RequestedExec = CLI.Args.back();
3658 assert(RequestedExec.Node && "No node for EXEC");
3659
3660 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3661 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3662
3663 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3664 CLI.Outs.pop_back();
3665 CLI.OutVals.pop_back();
3666
3667 if (RequestedExec.Ty->isIntegerTy(64)) {
3668 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3669 CLI.Outs.pop_back();
3670 CLI.OutVals.pop_back();
3671 }
3672
3673 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3674 "Haven't popped all the pieces of the EXEC mask");
3675 }
3676
3677 const SDLoc &DL = CLI.DL;
3679 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3681 SDValue Chain = CLI.Chain;
3682 SDValue Callee = CLI.Callee;
3683 bool &IsTailCall = CLI.IsTailCall;
3684 bool IsVarArg = CLI.IsVarArg;
3685 bool IsSibCall = false;
3687
3688 if (Callee.isUndef() || isNullConstant(Callee)) {
3689 if (!CLI.IsTailCall) {
3690 for (ISD::InputArg &Arg : CLI.Ins)
3691 InVals.push_back(DAG.getUNDEF(Arg.VT));
3692 }
3693
3694 return Chain;
3695 }
3696
3697 if (IsVarArg) {
3698 return lowerUnhandledCall(CLI, InVals,
3699 "unsupported call to variadic function ");
3700 }
3701
3702 if (!CLI.CB)
3703 report_fatal_error("unsupported libcall legalization");
3704
3705 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3706 return lowerUnhandledCall(CLI, InVals,
3707 "unsupported required tail call to function ");
3708 }
3709
3710 if (IsTailCall) {
3711 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3712 Outs, OutVals, Ins, DAG);
3713 if (!IsTailCall &&
3714 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3715 report_fatal_error("failed to perform tail call elimination on a call "
3716 "site marked musttail or on llvm.amdgcn.cs.chain");
3717 }
3718
3719 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3720
3721 // A sibling call is one where we're under the usual C ABI and not planning
3722 // to change that but can still do a tail call:
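// In a sibling call the caller's frame is reused as-is: no CALLSEQ_START/END
// bracketing is emitted, NumBytes is forced to 0 below, and the call is
// ultimately emitted as a TC_RETURN jump rather than a real call instruction.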
3723 if (!TailCallOpt && IsTailCall)
3724 IsSibCall = true;
3725
3726 if (IsTailCall)
3727 ++NumTailCalls;
3728 }
3729
3732 SmallVector<SDValue, 8> MemOpChains;
3733
3734 // Analyze operands of the call, assigning locations to each operand.
3736 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3737 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3738
3739 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3740 // With a fixed ABI, allocate fixed registers before user arguments.
3741 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3742 }
3743
3744 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3745
3746 // Get a count of how many bytes are to be pushed on the stack.
3747 unsigned NumBytes = CCInfo.getStackSize();
3748
3749 if (IsSibCall) {
3750 // Since we're not changing the ABI to make this a tail call, the memory
3751 // operands are already available in the caller's incoming argument space.
3752 NumBytes = 0;
3753 }
3754
3755 // FPDiff is the byte offset of the call's argument area from the callee's.
3756 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3757 // by this amount for a tail call. In a sibling call it must be 0 because the
3758 // caller will deallocate the entire stack and the callee still expects its
3759 // arguments to begin at SP+0. Completely unused for non-tail calls.
3760 int32_t FPDiff = 0;
3761 MachineFrameInfo &MFI = MF.getFrameInfo();
3762 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3763
3764 // Adjust the stack pointer for the new arguments...
3765 // These operations are automatically eliminated by the prolog/epilog pass
3766 if (!IsSibCall)
3767 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3768
3769 if (!IsSibCall || IsChainCallConv) {
3770 if (!Subtarget->enableFlatScratch()) {
3771 SmallVector<SDValue, 4> CopyFromChains;
3772
3773 // In the HSA case, this should be an identity copy.
3774 SDValue ScratchRSrcReg =
3775 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3776 RegsToPass.emplace_back(IsChainCallConv
3777 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3778 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3779 ScratchRSrcReg);
3780 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3781 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3782 }
3783 }
3784
3785 const unsigned NumSpecialInputs = RegsToPass.size();
3786
3787 MVT PtrVT = MVT::i32;
3788
3789 // Walk the register/memloc assignments, inserting copies/loads.
3790 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3791 CCValAssign &VA = ArgLocs[i];
3792 SDValue Arg = OutVals[i];
3793
3794 // Promote the value if needed.
3795 switch (VA.getLocInfo()) {
3796 case CCValAssign::Full:
3797 break;
3798 case CCValAssign::BCvt:
3799 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3800 break;
3801 case CCValAssign::ZExt:
3802 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3803 break;
3804 case CCValAssign::SExt:
3805 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3806 break;
3807 case CCValAssign::AExt:
3808 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3809 break;
3810 case CCValAssign::FPExt:
3811 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3812 break;
3813 default:
3814 llvm_unreachable("Unknown loc info!");
3815 }
3816
3817 if (VA.isRegLoc()) {
3818 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3819 } else {
3820 assert(VA.isMemLoc());
3821
3822 SDValue DstAddr;
3823 MachinePointerInfo DstInfo;
3824
3825 unsigned LocMemOffset = VA.getLocMemOffset();
3826 int32_t Offset = LocMemOffset;
3827
3828 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3829 MaybeAlign Alignment;
3830
3831 if (IsTailCall) {
3832 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3833 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3834 : VA.getValVT().getStoreSize();
3835
3836 // FIXME: We can have better than the minimum byval required alignment.
3837 Alignment =
3838 Flags.isByVal()
3839 ? Flags.getNonZeroByValAlign()
3840 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3841
3842 Offset = Offset + FPDiff;
3843 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3844
3845 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3846 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3847
3848 // Make sure any stack arguments overlapping with where we're storing
3849 // are loaded before this eventual operation. Otherwise they'll be
3850 // clobbered.
3851
3852 // FIXME: Why is this really necessary? This seems to just result in a
3853 // lot of code to copy the stack and write them back to the same
3854 // locations, which are supposed to be immutable?
3855 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3856 } else {
3857 // Stores to the argument stack area are relative to the stack pointer.
3858 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3859 MVT::i32);
3860 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3861 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3862 Alignment =
3863 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3864 }
3865
3866 if (Outs[i].Flags.isByVal()) {
3867 SDValue SizeNode =
3868 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3869 SDValue Cpy =
3870 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3871 Outs[i].Flags.getNonZeroByValAlign(),
3872 /*isVol = */ false, /*AlwaysInline = */ true,
3873 /*CI=*/nullptr, std::nullopt, DstInfo,
3875
3876 MemOpChains.push_back(Cpy);
3877 } else {
3878 SDValue Store =
3879 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3880 MemOpChains.push_back(Store);
3881 }
3882 }
3883 }
3884
3885 if (!MemOpChains.empty())
3886 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3887
3888 SDValue ReadFirstLaneID =
3889 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3890
3891 SDValue TokenGlue;
3892 if (CLI.ConvergenceControlToken) {
3893 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3895 }
3896
3897 // Build a sequence of copy-to-reg nodes chained together with token chain
3898 // and flag operands which copy the outgoing args into the appropriate regs.
3899 SDValue InGlue;
3900
3901 unsigned ArgIdx = 0;
3902 for (auto [Reg, Val] : RegsToPass) {
3903 if (ArgIdx++ >= NumSpecialInputs &&
3904 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3905 // For chain calls, the inreg arguments are required to be
3906 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3907 // they are uniform.
3908 //
3909 // For other calls, if an inreg argument is known to be uniform,
3910 // speculatively insert a readfirstlane in case it is in a VGPR.
3911 //
3912 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3913 // value, so let that continue to produce invalid code.
3914
3915 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3916 if (TokenGlue)
3917 ReadfirstlaneArgs.push_back(TokenGlue);
3919 ReadfirstlaneArgs);
3920 }
3921
3922 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3923 InGlue = Chain.getValue(1);
3924 }
3925
3926 // We don't usually want to end the call-sequence here because we would tidy
3927 // the frame up *after* the call. However, in the ABI-changing tail-call case
3928 // we've carefully laid out the parameters so that when sp is reset they'll be
3929 // in the correct location.
3930 if (IsTailCall && !IsSibCall) {
3931 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3932 InGlue = Chain.getValue(1);
3933 }
3934
3935 std::vector<SDValue> Ops({Chain});
3936
3937 // Add a redundant copy of the callee global which will not be legalized, as
3938 // we need direct access to the callee later.
3939 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3940 const GlobalValue *GV = GSD->getGlobal();
3941 Ops.push_back(Callee);
3942 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3943 } else {
3944 if (IsTailCall) {
3945 // isEligibleForTailCallOptimization considered whether the call target is
3946 // divergent, but we may still end up with a uniform value in a VGPR.
3947 // Insert a readfirstlane just in case.
3948 SDValue ReadFirstLaneID =
3949 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3950
3951 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3952 if (TokenGlue)
3953 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3954 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3955 ReadfirstlaneArgs);
3956 }
3957
3958 Ops.push_back(Callee);
3959 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3960 }
3961
3962 if (IsTailCall) {
3963 // Each tail call may have to adjust the stack by a different amount, so
3964 // this information must travel along with the operation for eventual
3965 // consumption by emitEpilogue.
3966 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3967 }
3968
3969 if (IsChainCallConv)
3970 Ops.push_back(RequestedExec.Node);
3971
3972 // Add argument registers to the end of the list so that they are known live
3973 // into the call.
3974 for (auto &[Reg, Val] : RegsToPass)
3975 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3976
3977 // Add a register mask operand representing the call-preserved registers.
3978 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3979 assert(Mask && "Missing call preserved mask for calling convention");
3980 Ops.push_back(DAG.getRegisterMask(Mask));
3981
3982 if (SDValue Token = CLI.ConvergenceControlToken) {
3984 GlueOps.push_back(Token);
3985 if (InGlue)
3986 GlueOps.push_back(InGlue);
3987
3988 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3989 MVT::Glue, GlueOps),
3990 0);
3991 }
3992
3993 if (InGlue)
3994 Ops.push_back(InGlue);
3995
3996 // If we're doing a tail call, use a TC_RETURN here rather than an
3997 // actual call instruction.
3998 if (IsTailCall) {
3999 MFI.setHasTailCall();
4000 unsigned OPC = AMDGPUISD::TC_RETURN;
4001 switch (CallConv) {
4004 break;
4008 break;
4009 }
4010
4011 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4012 }
4013
4014 // Returns a chain and a flag for retval copy to use.
4015 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4016 Chain = Call.getValue(0);
4017 InGlue = Call.getValue(1);
4018
4019 uint64_t CalleePopBytes = NumBytes;
4020 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4021 if (!Ins.empty())
4022 InGlue = Chain.getValue(1);
4023
4024 // Handle result values, copying them out of physregs into vregs that we
4025 // return.
4026 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4027 InVals, /*IsThisReturn=*/false, SDValue());
4028}
4029
4030// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4031// except for:
4032 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4033 // 2. Size scaling, where scaled size = wave-reduction(alloca-size) * wave-size
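//
// Illustrative sketch of the lowering (e.g. on a wave64 target,
// wavefront_size_log2 == 6):
//
//   BaseAddr = SP                          ; rounded up below if over-aligned
//   Size     = wave_reduce_umax(Size)      ; only for non-constant sizes
//   NewSP    = BaseAddr + (Size << 6)
//   SP       = readfirstlane(NewSP)        ; readfirstlane only for non-constant sizes
//   result   = BaseAddr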
4035 SelectionDAG &DAG) const {
4036 const MachineFunction &MF = DAG.getMachineFunction();
4038
4039 SDLoc dl(Op);
4040 EVT VT = Op.getValueType();
4041 SDValue Chain = Op.getOperand(0);
4042 Register SPReg = Info->getStackPtrOffsetReg();
4043
4044 // Chain the dynamic stack allocation so that it doesn't modify the stack
4045 // pointer when other instructions are using the stack.
4046 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4047
4048 SDValue Size = Op.getOperand(1);
4049 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4050 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4051
4052 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4054 "Stack grows upwards for AMDGPU");
4055
4056 Chain = BaseAddr.getValue(1);
4057 Align StackAlign = TFL->getStackAlign();
4058 if (Alignment > StackAlign) {
4059 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4060 << Subtarget->getWavefrontSizeLog2();
4061 uint64_t StackAlignMask = ScaledAlignment - 1;
4062 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4063 DAG.getConstant(StackAlignMask, dl, VT));
4064 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4065 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4066 }
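// For example (illustrative): with wave64 and a requested per-lane alignment
// of 16 bytes, ScaledAlignment is 16 << 6 = 1024, so BaseAddr is rounded up
// to the next multiple of 1024 bytes of scratch.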
4067
4068 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4069 SDValue NewSP;
4070 if (isa<ConstantSDNode>(Size)) {
4071 // For a constant-sized alloca, scale the alloca size by the wave size.
4072 SDValue ScaledSize = DAG.getNode(
4073 ISD::SHL, dl, VT, Size,
4074 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4075 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4076 } else {
4077 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4078 // max of the (divergent) alloca size and then scale it by the wave size.
4079 SDValue WaveReduction =
4080 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4081 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4082 Size, DAG.getConstant(0, dl, MVT::i32));
4083 SDValue ScaledSize = DAG.getNode(
4084 ISD::SHL, dl, VT, Size,
4085 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4086 NewSP =
4087 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4088 SDValue ReadFirstLaneID =
4089 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4090 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4091 NewSP);
4092 }
4093
4094 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4095 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4096
4097 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4098}
4099
4101 if (Op.getValueType() != MVT::i32)
4102 return Op; // Defer to cannot select error.
4103
4105 SDLoc SL(Op);
4106
4107 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4108
4109 // Convert from wave uniform to swizzled vector address. This should protect
4110 // from any edge cases where the stacksave result isn't directly used with
4111 // stackrestore.
4112 SDValue VectorAddress =
4113 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4114 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4115}
4116
4118 SelectionDAG &DAG) const {
4119 SDLoc SL(Op);
4120 assert(Op.getValueType() == MVT::i32);
4121
4122 uint32_t BothRoundHwReg =
4124 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4125
4126 SDValue IntrinID =
4127 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4128 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4129 Op.getOperand(0), IntrinID, GetRoundBothImm);
4130
4131 // There are two rounding modes, one for f32 and one for f64/f16. We only
4132 // report in the standard value range if both are the same.
4133 //
4134 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4135 // ties away from zero is not supported, and the other values are rotated by
4136 // 1.
4137 //
4138 // If the two rounding modes are not the same, report a target defined value.
4139
4140 // Mode register rounding mode fields:
4141 //
4142 // [1:0] Single-precision round mode.
4143 // [3:2] Double/Half-precision round mode.
4144 //
4145 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4146 //
4147 // Hardware Spec
4148 // Toward-0 3 0
4149 // Nearest Even 0 1
4150 // +Inf 1 2
4151 // -Inf 2 3
4152 // NearestAway0 N/A 4
4153 //
4154 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4155 // table we can index by the raw hardware mode.
4156 //
4157 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4158
4159 SDValue BitTable =
4161
4162 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4163 SDValue RoundModeTimesNumBits =
4164 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4165
4166 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4167 // knew only one mode was demanded.
4168 SDValue TableValue =
4169 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4170 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4171
4172 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4173 SDValue TableEntry =
4174 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4175
4176 // There's a gap between the 4-bit encoded table values and the actual enum
4177 // values, so offset the result if it's an extended value.
4178 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4179 SDValue IsStandardValue =
4180 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4181 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4182 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4183 TableEntry, EnumOffset);
4184
4185 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4186}
4187
4189 SelectionDAG &DAG) const {
4190 SDLoc SL(Op);
4191
4192 SDValue NewMode = Op.getOperand(1);
4193 assert(NewMode.getValueType() == MVT::i32);
4194
4195 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4196 // hardware MODE.fp_round values.
4197 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4198 uint32_t ClampedVal = std::min(
4199 static_cast<uint32_t>(ConstMode->getZExtValue()),
4201 NewMode = DAG.getConstant(
4202 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4203 } else {
4204 // If we know the input can only be one of the supported standard modes in
4205 // the range 0-3, we can use a simplified mapping to hardware values.
4206 KnownBits KB = DAG.computeKnownBits(NewMode);
4207 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4208 // The supported standard values are 0-3. The extended values start at 8. We
4209 // need to offset by 4 if the value is in the extended range.
4210
4211 if (UseReducedTable) {
4212 // Truncate to the low 32-bits.
4213 SDValue BitTable = DAG.getConstant(
4214 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4215
4216 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4217 SDValue RoundModeTimesNumBits =
4218 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4219
4220 NewMode =
4221 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4222
4223 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4224 // the table extracted bits into inline immediates.
4225 } else {
4226 // table_index = umin(value, value - 4)
4227 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
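// For example: for the standard value 2 (round toward +Inf), 2 - 4 wraps to a
// huge unsigned value, so umin picks 2 itself; for an extended value such as 8
// (the first extended mode), 8 - 4 = 4 is smaller, so the table index becomes
// 4, skipping the unused 4-7 gap in the FLT_ROUNDS encoding.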
4228 SDValue BitTable =
4230
4231 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4232 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4233 SDValue IndexVal =
4234 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4235
4236 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4237 SDValue RoundModeTimesNumBits =
4238 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4239
4240 SDValue TableValue =
4241 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4242 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4243
4244 // No need to mask out the high bits since the setreg will ignore them
4245 // anyway.
4246 NewMode = TruncTable;
4247 }
4248
4249 // Insert a readfirstlane in case the value is a VGPR. We could do this
4250 // earlier and keep more operations scalar, but that interferes with
4251 // combining the source.
4252 SDValue ReadFirstLaneID =
4253 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4254 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4255 ReadFirstLaneID, NewMode);
4256 }
4257
4258 // N.B. The setreg will be later folded into s_round_mode on supported
4259 // targets.
4260 SDValue IntrinID =
4261 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4262 uint32_t BothRoundHwReg =
4264 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4265
4266 SDValue SetReg =
4267 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4268 IntrinID, RoundBothImm, NewMode);
4269
4270 return SetReg;
4271}
4272
4274 if (Op->isDivergent())
4275 return SDValue();
4276
4277 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4282 break;
4283 default:
4284 return SDValue();
4285 }
4286
4287 return Op;
4288}
4289
4290// Work around DAG legality rules only based on the result type.
4292 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4293 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4294 EVT SrcVT = Src.getValueType();
4295
4296 if (SrcVT.getScalarType() != MVT::bf16)
4297 return Op;
4298
4299 SDLoc SL(Op);
4300 SDValue BitCast =
4301 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4302
4303 EVT DstVT = Op.getValueType();
4304 if (IsStrict)
4305 llvm_unreachable("Need STRICT_BF16_TO_FP");
4306
4307 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4308}
4309
4311 SDLoc SL(Op);
4312 if (Op.getValueType() != MVT::i64)
4313 return Op;
4314
4315 uint32_t ModeHwReg =
4317 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4318 uint32_t TrapHwReg =
4320 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4321
4322 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4323 SDValue IntrinID =
4324 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4325 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4326 Op.getOperand(0), IntrinID, ModeHwRegImm);
4327 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4328 Op.getOperand(0), IntrinID, TrapHwRegImm);
4329 SDValue TokenReg =
4330 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4331 GetTrapReg.getValue(1));
4332
4333 SDValue CvtPtr =
4334 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4335 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4336
4337 return DAG.getMergeValues({Result, TokenReg}, SL);
4338}
4339
4341 SDLoc SL(Op);
4342 if (Op.getOperand(1).getValueType() != MVT::i64)
4343 return Op;
4344
4345 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4346 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4347 DAG.getConstant(0, SL, MVT::i32));
4348 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4349 DAG.getConstant(1, SL, MVT::i32));
4350
4351 SDValue ReadFirstLaneID =
4352 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4353 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4354 ReadFirstLaneID, NewModeReg);
4355 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4356 ReadFirstLaneID, NewTrapReg);
4357
4358 unsigned ModeHwReg =
4360 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4361 unsigned TrapHwReg =
4363 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4364
4365 SDValue IntrinID =
4366 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4367 SDValue SetModeReg =
4368 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4369 IntrinID, ModeHwRegImm, NewModeReg);
4370 SDValue SetTrapReg =
4371 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4372 IntrinID, TrapHwRegImm, NewTrapReg);
4373 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4374}
4375
4377 const MachineFunction &MF) const {
4379 .Case("m0", AMDGPU::M0)
4380 .Case("exec", AMDGPU::EXEC)
4381 .Case("exec_lo", AMDGPU::EXEC_LO)
4382 .Case("exec_hi", AMDGPU::EXEC_HI)
4383 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4384 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4385 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4386 .Default(Register());
4387
4388 if (Reg == AMDGPU::NoRegister) {
4390 Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4391 }
4392
4393 if (!Subtarget->hasFlatScrRegister() &&
4394 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4395 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4396 "\" for subtarget."));
4397 }
4398
4399 switch (Reg) {
4400 case AMDGPU::M0:
4401 case AMDGPU::EXEC_LO:
4402 case AMDGPU::EXEC_HI:
4403 case AMDGPU::FLAT_SCR_LO:
4404 case AMDGPU::FLAT_SCR_HI:
4405 if (VT.getSizeInBits() == 32)
4406 return Reg;
4407 break;
4408 case AMDGPU::EXEC:
4409 case AMDGPU::FLAT_SCR:
4410 if (VT.getSizeInBits() == 64)
4411 return Reg;
4412 break;
4413 default:
4414 llvm_unreachable("missing register type checking");
4415 }
4416
4418 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4419}
4420
4421// If kill is not the last instruction, split the block so kill is always a
4422// proper terminator.
4425 MachineBasicBlock *BB) const {
4426 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4428 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4429 return SplitBB;
4430}
4431
4432 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4433// \p MI will be the only instruction in the loop body block. Otherwise, it will
4434// be the first instruction in the remainder block.
4435//
4436/// \returns { LoopBody, Remainder }
4437static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4441
4442 // To insert the loop we need to split the block. Move everything after this
4443 // point to a new block, and insert a new empty block between the two.
4445 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4447 ++MBBI;
4448
4449 MF->insert(MBBI, LoopBB);
4450 MF->insert(MBBI, RemainderBB);
4451
4452 LoopBB->addSuccessor(LoopBB);
4453 LoopBB->addSuccessor(RemainderBB);
4454
4455 // Move the rest of the block into a new block.
4456 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4457
4458 if (InstInLoop) {
4459 auto Next = std::next(I);
4460
4461 // Move instruction to loop body.
4462 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4463
4464 // Move the rest of the block.
4465 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4466 } else {
4467 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4468 }
4469
4470 MBB.addSuccessor(LoopBB);
4471
4472 return std::pair(LoopBB, RemainderBB);
4473}
4474
4475/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4477 MachineBasicBlock *MBB = MI.getParent();
4479 auto I = MI.getIterator();
4480 auto E = std::next(I);
4481
4482 // clang-format off
4483 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4484 .addImm(0);
4485 // clang-format on
4486
4487 MIBundleBuilder Bundler(*MBB, I, E);
4488 finalizeBundle(*MBB, Bundler.begin());
4489}
4490
4493 MachineBasicBlock *BB) const {
4494 const DebugLoc &DL = MI.getDebugLoc();
4495
4497
4499
4500 // Apparently kill flags are only valid if the def is in the same block?
4501 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4502 Src->setIsKill(false);
4503
4504 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4505
4506 MachineBasicBlock::iterator I = LoopBB->end();
4507
4508 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4510
4511 // Clear TRAP_STS.MEM_VIOL
4512 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4513 .addImm(0)
4514 .addImm(EncodedReg);
4515
4517
4518 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4519
4520 // Load and check TRAP_STS.MEM_VIOL
4521 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4522 .addImm(EncodedReg);
4523
4524 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4525 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4526 .addReg(Reg, RegState::Kill)
4527 .addImm(0);
4528 // clang-format off
4529 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4530 .addMBB(LoopBB);
4531 // clang-format on
4532
4533 return RemainderBB;
4534}
4535
4536// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4537// wavefront. If the value is uniform and just happens to be in a VGPR, this
4538// will only do one iteration. In the worst case, this will loop 64 times.
4539//
4540// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
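//
// Rough shape of the emitted loop (a sketch, not exact MIR):
//
//   loop:
//     CurrentIdx = V_READFIRSTLANE_B32 Idx      ; pick one lane's index value
//     Cond       = V_CMP_EQ_U32 CurrentIdx, Idx ; all lanes sharing that value
//     S_AND_SAVEEXEC Cond                       ; EXEC &= Cond, old EXEC saved
//     M0 (or an SGPR index) = CurrentIdx + Offset
//     ... the indexed move is emitted here by the caller ...
//     EXEC ^= saved EXEC                        ; retire the lanes just handled
//     S_CBRANCH_EXECNZ loop                     ; repeat while lanes remain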
4543 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4544 const DebugLoc &DL, const MachineOperand &Idx,
4545 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4546 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4547 Register &SGPRIdxReg) {
4548
4549 MachineFunction *MF = OrigBB.getParent();
4550 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4551 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4553
4554 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4555 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4556 Register NewExec = MRI.createVirtualRegister(BoolRC);
4557 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4558 Register CondReg = MRI.createVirtualRegister(BoolRC);
4559
4560 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4561 .addReg(InitReg)
4562 .addMBB(&OrigBB)
4563 .addReg(ResultReg)
4564 .addMBB(&LoopBB);
4565
4566 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4567 .addReg(InitSaveExecReg)
4568 .addMBB(&OrigBB)
4569 .addReg(NewExec)
4570 .addMBB(&LoopBB);
4571
4572 // Read the next variant <- also loop target.
4573 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4574 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4575
4576 // Compare the just-read index value to all possible Idx values.
4577 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4578 .addReg(CurrentIdxReg)
4579 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4580
4581 // Update EXEC, saving the original EXEC value into NewExec.
4582 BuildMI(LoopBB, I, DL,
4583 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4584 : AMDGPU::S_AND_SAVEEXEC_B64),
4585 NewExec)
4586 .addReg(CondReg, RegState::Kill);
4587
4588 MRI.setSimpleHint(NewExec, CondReg);
4589
4590 if (UseGPRIdxMode) {
4591 if (Offset == 0) {
4592 SGPRIdxReg = CurrentIdxReg;
4593 } else {
4594 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4595 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4596 .addReg(CurrentIdxReg, RegState::Kill)
4597 .addImm(Offset);
4598 }
4599 } else {
4600 // Move the index into M0.
4601 if (Offset == 0) {
4602 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4603 .addReg(CurrentIdxReg, RegState::Kill);
4604 } else {
4605 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4606 .addReg(CurrentIdxReg, RegState::Kill)
4607 .addImm(Offset);
4608 }
4609 }
4610
4611 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4612 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4613 MachineInstr *InsertPt =
4614 BuildMI(LoopBB, I, DL,
4615 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4616 : AMDGPU::S_XOR_B64_term),
4617 Exec)
4618 .addReg(Exec)
4619 .addReg(NewExec);
4620
4621 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4622 // s_cbranch_scc0?
4623
4624 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4625 // clang-format off
4626 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4627 .addMBB(&LoopBB);
4628 // clang-format on
4629
4630 return InsertPt->getIterator();
4631}
4632
4633 // This has slightly sub-optimal regalloc when the source vector is killed by
4634 // the read. The register allocator does not understand that the kill is
4635 // per-workitem, so the source is kept alive for the whole loop and we end up
4636 // not re-using a subregister from it, using 1 more VGPR than necessary. This
4637 // extra VGPR was avoided when this was expanded after register allocation.
4640 unsigned InitResultReg, unsigned PhiReg, int Offset,
4641 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4643 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4644 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4646 const DebugLoc &DL = MI.getDebugLoc();
4648
4649 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4650 Register DstReg = MI.getOperand(0).getReg();
4651 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4652 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4653 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4654 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4655
4656 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4657
4658 // Save the EXEC mask
4659 // clang-format off
4660 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4661 .addReg(Exec);
4662 // clang-format on
4663
4664 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4665
4666 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4667
4668 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4669 InitResultReg, DstReg, PhiReg, TmpExec,
4670 Offset, UseGPRIdxMode, SGPRIdxReg);
4671
4672 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4674 ++MBBI;
4675 MF->insert(MBBI, LandingPad);
4676 LoopBB->removeSuccessor(RemainderBB);
4677 LandingPad->addSuccessor(RemainderBB);
4678 LoopBB->addSuccessor(LandingPad);
4679 MachineBasicBlock::iterator First = LandingPad->begin();
4680 // clang-format off
4681 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4682 .addReg(SaveExec);
4683 // clang-format on
4684
4685 return InsPt;
4686}
4687
4688// Returns subreg index, offset
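// For example, for a 128-bit (4 x 32-bit) super-register class, a constant
// Offset of 2 yields {sub2, 0} (the constant offset is absorbed into the
// subregister index), while an out-of-bounds Offset such as 5 is returned
// unchanged as {sub0, 5}.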
4689static std::pair<unsigned, int>
4691 const TargetRegisterClass *SuperRC, unsigned VecReg,
4692 int Offset) {
4693 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4694
4695 // Skip out of bounds offsets, or else we would end up using an undefined
4696 // register.
4697 if (Offset >= NumElts || Offset < 0)
4698 return std::pair(AMDGPU::sub0, Offset);
4699
4700 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4701}
4702
4705 int Offset) {
4706 MachineBasicBlock *MBB = MI.getParent();
4707 const DebugLoc &DL = MI.getDebugLoc();
4709
4710 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4711
4712 assert(Idx->getReg() != AMDGPU::NoRegister);
4713
4714 if (Offset == 0) {
4715 // clang-format off
4716 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4717 .add(*Idx);
4718 // clang-format on
4719 } else {
4720 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4721 .add(*Idx)
4722 .addImm(Offset);
4723 }
4724}
4725
4728 int Offset) {
4729 MachineBasicBlock *MBB = MI.getParent();
4730 const DebugLoc &DL = MI.getDebugLoc();
4732
4733 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4734
4735 if (Offset == 0)
4736 return Idx->getReg();
4737
4738 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4739 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4740 .add(*Idx)
4741 .addImm(Offset);
4742 return Tmp;
4743}
4744
4747 const GCNSubtarget &ST) {
4748 const SIInstrInfo *TII = ST.getInstrInfo();
4749 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4752
4753 Register Dst = MI.getOperand(0).getReg();
4754 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4755 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4756 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4757
4758 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4759 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4760
4761 unsigned SubReg;
4762 std::tie(SubReg, Offset) =
4763 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4764
4765 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4766
4767 // Check for a SGPR index.
4768 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4770 const DebugLoc &DL = MI.getDebugLoc();
4771
4772 if (UseGPRIdxMode) {
4773 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4774 // to avoid interfering with other uses, so probably requires a new
4775 // optimization pass.
4777
4778 const MCInstrDesc &GPRIDXDesc =
4779 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4780 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4781 .addReg(SrcReg)
4782 .addReg(Idx)
4783 .addImm(SubReg);
4784 } else {
4786
4787 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4788 .addReg(SrcReg, 0, SubReg)
4789 .addReg(SrcReg, RegState::Implicit);
4790 }
4791
4792 MI.eraseFromParent();
4793
4794 return &MBB;
4795 }
4796
4797 // Control flow needs to be inserted if indexing with a VGPR.
4798 const DebugLoc &DL = MI.getDebugLoc();
4800
4801 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4802 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4803
4804 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4805
4806 Register SGPRIdxReg;
4807 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4808 UseGPRIdxMode, SGPRIdxReg);
4809
4810 MachineBasicBlock *LoopBB = InsPt->getParent();
4811
4812 if (UseGPRIdxMode) {
4813 const MCInstrDesc &GPRIDXDesc =
4814 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4815
4816 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4817 .addReg(SrcReg)
4818 .addReg(SGPRIdxReg)
4819 .addImm(SubReg);
4820 } else {
4821 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4822 .addReg(SrcReg, 0, SubReg)
4823 .addReg(SrcReg, RegState::Implicit);
4824 }
4825
4826 MI.eraseFromParent();
4827
4828 return LoopBB;
4829}
4830
4833 const GCNSubtarget &ST) {
4834 const SIInstrInfo *TII = ST.getInstrInfo();
4835 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4838
4839 Register Dst = MI.getOperand(0).getReg();
4840 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4841 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4842 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4843 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4844 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4845 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4846
4847 // This can be an immediate, but will be folded later.
4848 assert(Val->getReg());
4849
4850 unsigned SubReg;
4851 std::tie(SubReg, Offset) =
4852 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4853 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4854
4855 if (Idx->getReg() == AMDGPU::NoRegister) {
4857 const DebugLoc &DL = MI.getDebugLoc();
4858
4859 assert(Offset == 0);
4860
4861 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4862 .add(*SrcVec)
4863 .add(*Val)
4864 .addImm(SubReg);
4865
4866 MI.eraseFromParent();
4867 return &MBB;
4868 }
4869
4870 // Check for a SGPR index.
4871 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4873 const DebugLoc &DL = MI.getDebugLoc();
4874
4875 if (UseGPRIdxMode) {
4877
4878 const MCInstrDesc &GPRIDXDesc =
4879 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4880 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4881 .addReg(SrcVec->getReg())
4882 .add(*Val)
4883 .addReg(Idx)
4884 .addImm(SubReg);
4885 } else {
4887
4888 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4889 TRI.getRegSizeInBits(*VecRC), 32, false);
4890 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4891 .addReg(SrcVec->getReg())
4892 .add(*Val)
4893 .addImm(SubReg);
4894 }
4895 MI.eraseFromParent();
4896 return &MBB;
4897 }
4898
4899 // Control flow needs to be inserted if indexing with a VGPR.
4900 if (Val->isReg())
4901 MRI.clearKillFlags(Val->getReg());
4902
4903 const DebugLoc &DL = MI.getDebugLoc();
4904
4905 Register PhiReg = MRI.createVirtualRegister(VecRC);
4906
4907 Register SGPRIdxReg;
4908 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4909 UseGPRIdxMode, SGPRIdxReg);
4910 MachineBasicBlock *LoopBB = InsPt->getParent();
4911
4912 if (UseGPRIdxMode) {
4913 const MCInstrDesc &GPRIDXDesc =
4914 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4915
4916 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4917 .addReg(PhiReg)
4918 .add(*Val)
4919 .addReg(SGPRIdxReg)
4920 .addImm(SubReg);
4921 } else {
4922 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4923 TRI.getRegSizeInBits(*VecRC), 32, false);
4924 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4925 .addReg(PhiReg)
4926 .add(*Val)
4927 .addImm(SubReg);
4928 }
4929
4930 MI.eraseFromParent();
4931 return LoopBB;
4932}
4933
4936 const GCNSubtarget &ST,
4937 unsigned Opc) {
4939 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4940 const DebugLoc &DL = MI.getDebugLoc();
4941 const SIInstrInfo *TII = ST.getInstrInfo();
4942
4943 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4944 Register SrcReg = MI.getOperand(1).getReg();
4945 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4946 Register DstReg = MI.getOperand(0).getReg();
4947 MachineBasicBlock *RetBB = nullptr;
4948 if (isSGPR) {
4949 // These operations with a uniform value (i.e. an SGPR) are idempotent.
4950 // The reduced value will be the same as the given SGPR.
4951 // clang-format off
4952 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4953 .addReg(SrcReg);
4954 // clang-format on
4955 RetBB = &BB;
4956 } else {
4957 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4958 // operand. For now, for all the cases (default, Iterative and DPP) we use
4959 // the iterative approach by default.
4960
4961 // To reduce the VGPR using the iterative approach, we need to iterate over
4962 // all the active lanes. Lowering consists of a ComputeLoop, which iterates
4963 // over only the active lanes. We use a copy of the EXEC register as the
4964 // induction variable, and each iteration clears the processed lane's bit with
4965 // bitset0 so that we will get the next active lane for the next iteration.
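//
// Rough shape of the loop (a sketch):
//
//   Accumulator = init value (UINT32_MAX for umin, 0 for umax)
//   ActiveBits  = copy of EXEC
//   do {
//     Lane        = s_ff1(ActiveBits)            ; lowest remaining active lane
//     LaneValue   = v_readlane(SrcReg, Lane)
//     Accumulator = Opc(Accumulator, LaneValue)  ; e.g. S_MIN_U32 / S_MAX_U32
//     ActiveBits  = s_bitset0(ActiveBits, Lane)
//   } while (ActiveBits != 0)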
4967 Register SrcReg = MI.getOperand(1).getReg();
4968
4969 // Create control flow for the loop:
4970 // split MI's machine basic block into the ComputeLoop and ComputeEnd blocks.
4971 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4972
4973 // Create virtual registers required for lowering.
4974 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4975 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4976 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4977 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4978
4979 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4980 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4981 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4982
4983 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4984 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4985
4986 bool IsWave32 = ST.isWave32();
4987 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4988 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4989
4990 // Create the initial values of the induction variable (from EXEC) and the
4991 // accumulator, and insert a branch instr to the newly created ComputeLoop block.
4992 uint32_t InitalValue =
4993 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4994 auto TmpSReg =
4995 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4996 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4997 .addImm(InitalValue);
4998 // clang-format off
4999 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5000 .addMBB(ComputeLoop);
5001 // clang-format on
5002
5003 // Start constructing ComputeLoop
5004 I = ComputeLoop->end();
5005 auto Accumulator =
5006 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5007 .addReg(InitalValReg)
5008 .addMBB(&BB);
5009 auto ActiveBits =
5010 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5011 .addReg(TmpSReg->getOperand(0).getReg())
5012 .addMBB(&BB);
5013
5014 // Perform the computations
5015 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5016 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5017 .addReg(ActiveBits->getOperand(0).getReg());
5018 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5019 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5020 .addReg(SrcReg)
5021 .addReg(FF1->getOperand(0).getReg());
5022 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5023 .addReg(Accumulator->getOperand(0).getReg())
5024 .addReg(LaneValue->getOperand(0).getReg());
5025
5026 // Manipulate the iterator to get the next active lane
5027 unsigned BITSETOpc =
5028 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5029 auto NewActiveBits =
5030 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5031 .addReg(FF1->getOperand(0).getReg())
5032 .addReg(ActiveBits->getOperand(0).getReg());
5033
5034 // Add phi nodes
5035 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5036 .addMBB(ComputeLoop);
5037 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5038 .addMBB(ComputeLoop);
5039
5040 // Create the loop back-edge branch.
5041 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5042 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5043 .addReg(NewActiveBits->getOperand(0).getReg())
5044 .addImm(0);
5045 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5046 .addMBB(ComputeLoop);
5047
5048 RetBB = ComputeEnd;
5049 }
5050 MI.eraseFromParent();
5051 return RetBB;
5052}
5053
5056 MachineBasicBlock *BB) const {
5057
5059 MachineFunction *MF = BB->getParent();
5061
5062 switch (MI.getOpcode()) {
5063 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5064 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5065 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5066 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5067 case AMDGPU::S_UADDO_PSEUDO:
5068 case AMDGPU::S_USUBO_PSEUDO: {
5069 const DebugLoc &DL = MI.getDebugLoc();
5070 MachineOperand &Dest0 = MI.getOperand(0);
5071 MachineOperand &Dest1 = MI.getOperand(1);
5072 MachineOperand &Src0 = MI.getOperand(2);
5073 MachineOperand &Src1 = MI.getOperand(3);
5074
5075 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5076 ? AMDGPU::S_ADD_I32
5077 : AMDGPU::S_SUB_I32;
5078 // clang-format off
5079 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5080 .add(Src0)
5081 .add(Src1);
5082 // clang-format on
5083
5084 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5085 .addImm(1)
5086 .addImm(0);
5087
5088 MI.eraseFromParent();
5089 return BB;
5090 }
5091 case AMDGPU::S_ADD_U64_PSEUDO:
5092 case AMDGPU::S_SUB_U64_PSEUDO: {
5093 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5094 // For GFX12, we emit s_add_u64 and s_sub_u64.
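// The pre-GFX12 expansion is roughly:
//   lo  = s_add_u32  src0.lo, src1.lo   ; SCC = carry-out
//   hi  = s_addc_u32 src0.hi, src1.hi   ; consumes SCC as carry-in
//   dst = REG_SEQUENCE lo, sub0, hi, sub1
// (with s_sub_u32 / s_subb_u32 for the subtraction case).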
5095 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5097 const DebugLoc &DL = MI.getDebugLoc();
5098 MachineOperand &Dest = MI.getOperand(0);
5099 MachineOperand &Src0 = MI.getOperand(1);
5100 MachineOperand &Src1 = MI.getOperand(2);
5101 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5102 if (Subtarget->hasScalarAddSub64()) {
5103 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5104 // clang-format off
5105 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5106 .add(Src0)
5107 .add(Src1);
5108 // clang-format on
5109 } else {
5110 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5111 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5112
5113 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5114 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5115
5116 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5117 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5118 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5119 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5120
5121 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5122 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5123 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5124 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5125
5126 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5127 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5128 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5129 .add(Src0Sub0)
5130 .add(Src1Sub0);
5131 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5132 .add(Src0Sub1)
5133 .add(Src1Sub1);
5134 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5135 .addReg(DestSub0)
5136 .addImm(AMDGPU::sub0)
5137 .addReg(DestSub1)
5138 .addImm(AMDGPU::sub1);
5139 }
5140 MI.eraseFromParent();
5141 return BB;
5142 }
5143 case AMDGPU::V_ADD_U64_PSEUDO:
5144 case AMDGPU::V_SUB_U64_PSEUDO: {
5146 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5147 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5148 const DebugLoc &DL = MI.getDebugLoc();
5149
5150 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5151
5152 MachineOperand &Dest = MI.getOperand(0);
5153 MachineOperand &Src0 = MI.getOperand(1);
5154 MachineOperand &Src1 = MI.getOperand(2);
5155
5156 if (IsAdd && ST.hasLshlAddB64()) {
5157 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5158 Dest.getReg())
5159 .add(Src0)
5160 .addImm(0)
5161 .add(Src1);
5162 TII->legalizeOperands(*Add);
5163 MI.eraseFromParent();
5164 return BB;
5165 }
5166
5167 const auto *CarryRC = TRI->getWaveMaskRegClass();
5168
5169 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5170 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5171
5172 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5173 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5174
5175 const TargetRegisterClass *Src0RC = Src0.isReg()
5176 ? MRI.getRegClass(Src0.getReg())
5177 : &AMDGPU::VReg_64RegClass;
5178 const TargetRegisterClass *Src1RC = Src1.isReg()
5179 ? MRI.getRegClass(Src1.getReg())
5180 : &AMDGPU::VReg_64RegClass;
5181
5182 const TargetRegisterClass *Src0SubRC =
5183 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5184 const TargetRegisterClass *Src1SubRC =
5185 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5186
5187 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5188 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5189 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5190 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5191
5192 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5193 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5194 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5195 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5196
5197 unsigned LoOpc =
5198 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5199 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5200 .addReg(CarryReg, RegState::Define)
5201 .add(SrcReg0Sub0)
5202 .add(SrcReg1Sub0)
5203 .addImm(0); // clamp bit
5204
5205 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5206 MachineInstr *HiHalf =
5207 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5208 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5209 .add(SrcReg0Sub1)
5210 .add(SrcReg1Sub1)
5211 .addReg(CarryReg, RegState::Kill)
5212 .addImm(0); // clamp bit
5213
5214 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5215 .addReg(DestSub0)
5216 .addImm(AMDGPU::sub0)
5217 .addReg(DestSub1)
5218 .addImm(AMDGPU::sub1);
5219 TII->legalizeOperands(*LoHalf);
5220 TII->legalizeOperands(*HiHalf);
5221 MI.eraseFromParent();
5222 return BB;
5223 }
5224 case AMDGPU::S_ADD_CO_PSEUDO:
5225 case AMDGPU::S_SUB_CO_PSEUDO: {
5226 // This pseudo can only be selected
5227 // from a uniform add/subcarry node. All the VGPR operands
5228 // are therefore assumed to be splat vectors.
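// Wave-uniform values that nevertheless live in VGPRs are therefore moved into
// SGPRs with V_READFIRSTLANE_B32 below before the scalar carry ops are emitted.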
5230 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5231 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5233 const DebugLoc &DL = MI.getDebugLoc();
5234 MachineOperand &Dest = MI.getOperand(0);
5235 MachineOperand &CarryDest = MI.getOperand(1);
5236 MachineOperand &Src0 = MI.getOperand(2);
5237 MachineOperand &Src1 = MI.getOperand(3);
5238 MachineOperand &Src2 = MI.getOperand(4);
5239 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5240 ? AMDGPU::S_ADDC_U32
5241 : AMDGPU::S_SUBB_U32;
5242 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5243 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5244 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5245 .addReg(Src0.getReg());
5246 Src0.setReg(RegOp0);
5247 }
5248 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5249 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5250 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5251 .addReg(Src1.getReg());
5252 Src1.setReg(RegOp1);
5253 }
5254 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5255 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5256 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5257 .addReg(Src2.getReg());
5258 Src2.setReg(RegOp2);
5259 }
5260
5261 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5262 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5263 assert(WaveSize == 64 || WaveSize == 32);
5264
5265 if (WaveSize == 64) {
5266 if (ST.hasScalarCompareEq64()) {
5267 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5268 .addReg(Src2.getReg())
5269 .addImm(0);
5270 } else {
5271 const TargetRegisterClass *SubRC =
5272 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5273 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5274 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5275 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5276 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5277 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5278
5279 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5280 .add(Src2Sub0)
5281 .add(Src2Sub1);
5282
5283 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5284 .addReg(Src2_32, RegState::Kill)
5285 .addImm(0);
5286 }
5287 } else {
5288 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5289 .addReg(Src2.getReg())
5290 .addImm(0);
5291 }
5292
5293 // clang-format off
5294 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5295 .add(Src0)
5296 .add(Src1);
5297 // clang-format on
5298
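     // The scalar add/sub wrote the carry-out to SCC; expand it into an
     // all-ones or all-zeros mask in the carry destination.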
5299 unsigned SelOpc =
5300 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5301
5302 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5303 .addImm(-1)
5304 .addImm(0);
5305
5306 MI.eraseFromParent();
5307 return BB;
5308 }
5309 case AMDGPU::SI_INIT_M0: {
5310 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5311 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5312 .add(MI.getOperand(0));
5313 MI.eraseFromParent();
5314 return BB;
5315 }
5316 case AMDGPU::GET_GROUPSTATICSIZE: {
5317 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5318 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5319 DebugLoc DL = MI.getDebugLoc();
5320 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5321 .add(MI.getOperand(0))
5322 .addImm(MFI->getLDSSize());
5323 MI.eraseFromParent();
5324 return BB;
5325 }
5326 case AMDGPU::GET_SHADERCYCLESHILO: {
5329 const DebugLoc &DL = MI.getDebugLoc();
5330 // The algorithm is:
5331 //
5332 // hi1 = getreg(SHADER_CYCLES_HI)
5333 // lo1 = getreg(SHADER_CYCLES_LO)
5334 // hi2 = getreg(SHADER_CYCLES_HI)
5335 //
5336 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5337 // Otherwise there was overflow and the result is hi2:0. In both cases the
5338 // result should represent the actual time at some point during the sequence
5339 // of three getregs.
5340 using namespace AMDGPU::Hwreg;
5341 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5342 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5343 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5344 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5345 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5346 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5347 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5348 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5349 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5350 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5351 .addReg(RegHi1)
5352 .addReg(RegHi2);
5353 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5354 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5355 .addReg(RegLo1)
5356 .addImm(0);
5357 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5358 .add(MI.getOperand(0))
5359 .addReg(RegLo)
5360 .addImm(AMDGPU::sub0)
5361 .addReg(RegHi2)
5362 .addImm(AMDGPU::sub1);
5363 MI.eraseFromParent();
5364 return BB;
5365 }
5366 case AMDGPU::SI_INDIRECT_SRC_V1:
5367 case AMDGPU::SI_INDIRECT_SRC_V2:
5368 case AMDGPU::SI_INDIRECT_SRC_V4:
5369 case AMDGPU::SI_INDIRECT_SRC_V8:
5370 case AMDGPU::SI_INDIRECT_SRC_V9:
5371 case AMDGPU::SI_INDIRECT_SRC_V10:
5372 case AMDGPU::SI_INDIRECT_SRC_V11:
5373 case AMDGPU::SI_INDIRECT_SRC_V12:
5374 case AMDGPU::SI_INDIRECT_SRC_V16:
5375 case AMDGPU::SI_INDIRECT_SRC_V32:
5376 return emitIndirectSrc(MI, *BB, *getSubtarget());
5377 case AMDGPU::SI_INDIRECT_DST_V1:
5378 case AMDGPU::SI_INDIRECT_DST_V2:
5379 case AMDGPU::SI_INDIRECT_DST_V4:
5380 case AMDGPU::SI_INDIRECT_DST_V8:
5381 case AMDGPU::SI_INDIRECT_DST_V9:
5382 case AMDGPU::SI_INDIRECT_DST_V10:
5383 case AMDGPU::SI_INDIRECT_DST_V11:
5384 case AMDGPU::SI_INDIRECT_DST_V12:
5385 case AMDGPU::SI_INDIRECT_DST_V16:
5386 case AMDGPU::SI_INDIRECT_DST_V32:
5387 return emitIndirectDst(MI, *BB, *getSubtarget());
5388 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5389 case AMDGPU::SI_KILL_I1_PSEUDO:
5390 return splitKillBlock(MI, BB);
5391 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5393 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5394 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5395
5396 Register Dst = MI.getOperand(0).getReg();
5397 const MachineOperand &Src0 = MI.getOperand(1);
5398 const MachineOperand &Src1 = MI.getOperand(2);
5399 const DebugLoc &DL = MI.getDebugLoc();
5400 Register SrcCond = MI.getOperand(3).getReg();
5401
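     // There is no 64-bit conditional move; select each 32-bit half with the
     // same condition mask and reassemble the result with a REG_SEQUENCE.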
5402 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5403 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5404 const auto *CondRC = TRI->getWaveMaskRegClass();
5405 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5406
5407 const TargetRegisterClass *Src0RC = Src0.isReg()
5408 ? MRI.getRegClass(Src0.getReg())
5409 : &AMDGPU::VReg_64RegClass;
5410 const TargetRegisterClass *Src1RC = Src1.isReg()
5411 ? MRI.getRegClass(Src1.getReg())
5412 : &AMDGPU::VReg_64RegClass;
5413
5414 const TargetRegisterClass *Src0SubRC =
5415 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5416 const TargetRegisterClass *Src1SubRC =
5417 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5418
5419 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5420 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5421 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5422 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5423
5424 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5425 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5426 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5427 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5428
5429 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5430 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5431 .addImm(0)
5432 .add(Src0Sub0)
5433 .addImm(0)
5434 .add(Src1Sub0)
5435 .addReg(SrcCondCopy);
5436 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5437 .addImm(0)
5438 .add(Src0Sub1)
5439 .addImm(0)
5440 .add(Src1Sub1)
5441 .addReg(SrcCondCopy);
5442
5443 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5444 .addReg(DstLo)
5445 .addImm(AMDGPU::sub0)
5446 .addReg(DstHi)
5447 .addImm(AMDGPU::sub1);
5448 MI.eraseFromParent();
5449 return BB;
5450 }
5451 case AMDGPU::SI_BR_UNDEF: {
5453 const DebugLoc &DL = MI.getDebugLoc();
5454 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5455 .add(MI.getOperand(0));
5456 Br->getOperand(1).setIsUndef(); // read undef SCC
5457 MI.eraseFromParent();
5458 return BB;
5459 }
5460 case AMDGPU::ADJCALLSTACKUP:
5461 case AMDGPU::ADJCALLSTACKDOWN: {
5463 MachineInstrBuilder MIB(*MF, &MI);
5464 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5465 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5466 return BB;
5467 }
5468 case AMDGPU::SI_CALL_ISEL: {
5470 const DebugLoc &DL = MI.getDebugLoc();
5471
5472 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5473
5475 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5476
5477 for (const MachineOperand &MO : MI.operands())
5478 MIB.add(MO);
5479
5480 MIB.cloneMemRefs(MI);
5481 MI.eraseFromParent();
5482 return BB;
5483 }
5484 case AMDGPU::V_ADD_CO_U32_e32:
5485 case AMDGPU::V_SUB_CO_U32_e32:
5486 case AMDGPU::V_SUBREV_CO_U32_e32: {
5487 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5488 const DebugLoc &DL = MI.getDebugLoc();
5489 unsigned Opc = MI.getOpcode();
5490
5491 bool NeedClampOperand = false;
5492 if (TII->pseudoToMCOpcode(Opc) == -1) {
5493 Opc = AMDGPU::getVOPe64(Opc);
5494 NeedClampOperand = true;
5495 }
5496
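     // Rebuild the add/sub with the chosen opcode; the VOP3 (e64) form takes
     // an explicit carry-out register and a clamp operand.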
5497 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5498 if (TII->isVOP3(*I)) {
5499 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5500 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5501 I.addReg(TRI->getVCC(), RegState::Define);
5502 }
5503 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5504 if (NeedClampOperand)
5505 I.addImm(0); // clamp bit for e64 encoding
5506
5507 TII->legalizeOperands(*I);
5508
5509 MI.eraseFromParent();
5510 return BB;
5511 }
5512 case AMDGPU::V_ADDC_U32_e32:
5513 case AMDGPU::V_SUBB_U32_e32:
5514 case AMDGPU::V_SUBBREV_U32_e32:
5515 // These instructions have an implicit use of vcc which counts towards the
5516 // constant bus limit.
5517 TII->legalizeOperands(MI);
5518 return BB;
5519 case AMDGPU::DS_GWS_INIT:
5520 case AMDGPU::DS_GWS_SEMA_BR:
5521 case AMDGPU::DS_GWS_BARRIER:
5522 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5523 [[fallthrough]];
5524 case AMDGPU::DS_GWS_SEMA_V:
5525 case AMDGPU::DS_GWS_SEMA_P:
5526 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5527 // An s_waitcnt 0 is required to be the instruction immediately following.
5528 if (getSubtarget()->hasGWSAutoReplay()) {
5530 return BB;
5531 }
5532
5533 return emitGWSMemViolTestLoop(MI, BB);
5534 case AMDGPU::S_SETREG_B32: {
5535 // Try to optimize cases that only set the denormal mode or rounding mode.
5536 //
5537 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5538 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5539 // instead.
5540 //
5541 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5542 // allow a no-side-effect instruction in the output of a side-effecting
5543 // pattern.
5544 auto [ID, Offset, Width] =
5545 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5546 if (ID != AMDGPU::Hwreg::ID_MODE)
5547 return BB;
5548
5549 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5550 const unsigned SetMask = WidthMask << Offset;
5551
5552 if (getSubtarget()->hasDenormModeInst()) {
5553 unsigned SetDenormOp = 0;
5554 unsigned SetRoundOp = 0;
5555
5556 // The dedicated instructions can only set the whole denorm or round mode
5557 // at once, not a subset of bits in either.
5558 if (SetMask ==
5559     (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5560 // If this fully sets both the round and denorm mode, emit the two
5561 // dedicated instructions for these.
5562 SetRoundOp = AMDGPU::S_ROUND_MODE;
5563 SetDenormOp = AMDGPU::S_DENORM_MODE;
5564 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5565 SetRoundOp = AMDGPU::S_ROUND_MODE;
5566 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5567 SetDenormOp = AMDGPU::S_DENORM_MODE;
5568 }
5569
5570 if (SetRoundOp || SetDenormOp) {
5572 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5573 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5574 unsigned ImmVal = Def->getOperand(1).getImm();
5575 if (SetRoundOp) {
5576 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5577 .addImm(ImmVal & 0xf);
5578
5579 // If we also have the denorm mode, get just the denorm mode bits.
5580 ImmVal >>= 4;
5581 }
5582
5583 if (SetDenormOp) {
5584 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5585 .addImm(ImmVal & 0xf);
5586 }
5587
5588 MI.eraseFromParent();
5589 return BB;
5590 }
5591 }
5592 }
5593
5594 // If only FP bits are touched, use the no-side-effects pseudo.
5595 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5596 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5597 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5598
5599 return BB;
5600 }
5601 case AMDGPU::S_INVERSE_BALLOT_U32:
5602 case AMDGPU::S_INVERSE_BALLOT_U64:
5603 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5604 // necessary. After that they are equivalent to a COPY.
5605 MI.setDesc(TII->get(AMDGPU::COPY));
5606 return BB;
5607 case AMDGPU::ENDPGM_TRAP: {
5608 const DebugLoc &DL = MI.getDebugLoc();
5609 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5610 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5611 MI.addOperand(MachineOperand::CreateImm(0));
5612 return BB;
5613 }
5614
5615 // We need a block split to make the real endpgm a terminator. We also don't
5616 // want to break phis in successor blocks, so we can't just delete to the
5617 // end of the block.
5618
5619 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5620 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5621 MF->push_back(TrapBB);
5622 // clang-format off
5623 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5624 .addImm(0);
5625 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5626 .addMBB(TrapBB);
5627 // clang-format on
5628
5629 BB->addSuccessor(TrapBB);
5630 MI.eraseFromParent();
5631 return SplitBB;
5632 }
5633 case AMDGPU::SIMULATED_TRAP: {
5634 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5636 MachineBasicBlock *SplitBB =
5637 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5638 MI.eraseFromParent();
5639 return SplitBB;
5640 }
5641 default:
5642 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5643 if (!MI.mayStore())
5645 return BB;
5646 }
5648 }
5649}
5650
5652 // This currently forces unfolding various combinations of fsub into fma with
5653 // free fneg'd operands. As long as we have fast FMA (controlled by
5654 // isFMAFasterThanFMulAndFAdd), we should perform these.
5655
5656 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5657 // most of these combines appear to be cycle neutral but save on instruction
5658 // count / code size.
5659 return true;
5660}
5661
5663
5665 EVT VT) const {
5666 if (!VT.isVector()) {
5667 return MVT::i1;
5668 }
5669 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5670}
5671
5673 // TODO: Should i16 be used always if legal? For now it would force VALU
5674 // shifts.
5675 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5676}
5677
5679 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5680 ? Ty.changeElementSize(16)
5681 : Ty.changeElementSize(32);
5682}
5683
5684 // Answering this is somewhat tricky and depends on the specific device,
5685 // since different devices have different rates for fma and for f64 operations.
5686//
5687// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5688// regardless of which device (although the number of cycles differs between
5689// devices), so it is always profitable for f64.
5690//
5691// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5692// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5693// which we can always do even without fused FP ops since it returns the same
5694// result as the separate operations and since it is always full
5695// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5696// however does not support denormals, so we do report fma as faster if we have
5697// a fast fma device and require denormals.
5698//
5700 EVT VT) const {
5701 VT = VT.getScalarType();
5702
5703 switch (VT.getSimpleVT().SimpleTy) {
5704 case MVT::f32: {
5705 // If mad is not available this depends only on if f32 fma is full rate.
5706 if (!Subtarget->hasMadMacF32Insts())
5707 return Subtarget->hasFastFMAF32();
5708
5709 // Otherwise f32 mad is always full rate and returns the same result as
5710 // the separate operations so should be preferred over fma.
5711 // However, it does not support denormals.
5712 if (!denormalModeIsFlushAllF32(MF))
5713 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5714
5715 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5716 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5717 }
5718 case MVT::f64:
5719 return true;
5720 case MVT::f16:
5721 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5722 default:
5723 break;
5724 }
5725
5726 return false;
5727}
5728
5730 LLT Ty) const {
5731 switch (Ty.getScalarSizeInBits()) {
5732 case 16:
5733 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5734 case 32:
5735 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5736 case 64:
5737 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5738 default:
5739 break;
5740 }
5741
5742 return false;
5743}
5744
5745// Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5746// specific details.
5748 Type *Ty) const {
5749 switch (Ty->getScalarSizeInBits()) {
5750 case 16: {
5752 return Subtarget->has16BitInsts() &&
5753 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5754 }
5755 case 32: {
5756 if (!Subtarget->hasMadMacF32Insts())
5757 return Subtarget->hasFastFMAF32();
5758
5760 if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5761 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5762
5763 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5764 }
5765 case 64:
5766 return true;
5767 default:
5768 break;
5769 }
5770
5771 return false;
5772}
5773
5775 if (!Ty.isScalar())
5776 return false;
5777
5778 if (Ty.getScalarSizeInBits() == 16)
5779 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5780 if (Ty.getScalarSizeInBits() == 32)
5781 return Subtarget->hasMadMacF32Insts() &&
5782 denormalModeIsFlushAllF32(*MI.getMF());
5783
5784 return false;
5785}
5786
5788 const SDNode *N) const {
5789 // TODO: Check future ftz flag
5790 // v_mad_f32/v_mac_f32 do not support denormals.
5791 EVT VT = N->getValueType(0);
5792 if (VT == MVT::f32)
5793 return Subtarget->hasMadMacF32Insts() &&
5794        denormalModeIsFlushAllF32(DAG.getMachineFunction());
5795 if (VT == MVT::f16) {
5796 return Subtarget->hasMadF16() &&
5797        denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5798 }
5799
5800 return false;
5801}
5802
5803//===----------------------------------------------------------------------===//
5804// Custom DAG Lowering Operations
5805//===----------------------------------------------------------------------===//
5806
5807// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5808// wider vector type is legal.
5810 SelectionDAG &DAG) const {
5811 unsigned Opc = Op.getOpcode();
5812 EVT VT = Op.getValueType();
5813 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5814 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5815 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5816 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5817
5818 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5819
5820 SDLoc SL(Op);
5821 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5822 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5823
5824 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5825}
5826
5827// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5828// wider vector type is legal.
5830 SelectionDAG &DAG) const {
5831 unsigned Opc = Op.getOpcode();
5832 EVT VT = Op.getValueType();
5833 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5834 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5835 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5836 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5837
5838 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5839 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5840
5841 SDLoc SL(Op);
5842
5843 SDValue OpLo =
5844 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5845 SDValue OpHi =
5846 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5847
5848 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5849}
5850
5852 SelectionDAG &DAG) const {
5853 unsigned Opc = Op.getOpcode();
5854 EVT VT = Op.getValueType();
5855 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5856 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5857 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5858 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5859 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5860 VT == MVT::v32bf16);
5861
5862 SDValue Op0 = Op.getOperand(0);
5863 auto [Lo0, Hi0] = Op0.getValueType().isVector()
5864 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5865 : std::pair(Op0, Op0);
5866
5867 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5868 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5869
5870 SDLoc SL(Op);
5871 auto ResVT = DAG.GetSplitDestVTs(VT);
5872
5873 SDValue OpLo =
5874 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5875 SDValue OpHi =
5876 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5877
5878 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5879}
5880
5882 switch (Op.getOpcode()) {
5883 default:
5884 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5885 case ISD::BRCOND:
5886 return LowerBRCOND(Op, DAG);
5887 case ISD::RETURNADDR:
5888 return LowerRETURNADDR(Op, DAG);
5889 case ISD::LOAD: {
5890 SDValue Result = LowerLOAD(Op, DAG);
5891 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5892 "Load should return a value and a chain");
5893 return Result;
5894 }
5895 case ISD::FSQRT: {
5896 EVT VT = Op.getValueType();
5897 if (VT == MVT::f32)
5898 return lowerFSQRTF32(Op, DAG);
5899 if (VT == MVT::f64)
5900 return lowerFSQRTF64(Op, DAG);
5901 return SDValue();
5902 }
5903 case ISD::FSIN:
5904 case ISD::FCOS:
5905 return LowerTrig(Op, DAG);
5906 case ISD::SELECT:
5907 return LowerSELECT(Op, DAG);
5908 case ISD::FDIV:
5909 return LowerFDIV(Op, DAG);
5910 case ISD::FFREXP:
5911 return LowerFFREXP(Op, DAG);
5912 case ISD::ATOMIC_CMP_SWAP:
5913 return LowerATOMIC_CMP_SWAP(Op, DAG);
5914 case ISD::STORE:
5915 return LowerSTORE(Op, DAG);
5916 case ISD::GlobalAddress: {
5917 MachineFunction &MF = DAG.getMachineFunction();
5918 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5919 return LowerGlobalAddress(MFI, Op, DAG);
5920 }
5921 case ISD::INTRINSIC_WO_CHAIN:
5922 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5923 case ISD::INTRINSIC_W_CHAIN:
5924 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5925 case ISD::INTRINSIC_VOID:
5926 return LowerINTRINSIC_VOID(Op, DAG);
5927 case ISD::ADDRSPACECAST:
5928 return lowerADDRSPACECAST(Op, DAG);
5929 case ISD::INSERT_SUBVECTOR:
5930 return lowerINSERT_SUBVECTOR(Op, DAG);
5931 case ISD::INSERT_VECTOR_ELT:
5932 return lowerINSERT_VECTOR_ELT(Op, DAG);
5933 case ISD::EXTRACT_VECTOR_ELT:
5934 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5935 case ISD::VECTOR_SHUFFLE:
5936 return lowerVECTOR_SHUFFLE(Op, DAG);
5937 case ISD::SCALAR_TO_VECTOR:
5938 return lowerSCALAR_TO_VECTOR(Op, DAG);
5939 case ISD::BUILD_VECTOR:
5940 return lowerBUILD_VECTOR(Op, DAG);
5941 case ISD::FP_ROUND:
5942 case ISD::STRICT_FP_ROUND:
5943 return lowerFP_ROUND(Op, DAG);
5944 case ISD::TRAP:
5945 return lowerTRAP(Op, DAG);
5946 case ISD::DEBUGTRAP:
5947 return lowerDEBUGTRAP(Op, DAG);
5948 case ISD::ABS:
5949 case ISD::FABS:
5950 case ISD::FNEG:
5951 case ISD::FCANONICALIZE:
5952 case ISD::BSWAP:
5953 return splitUnaryVectorOp(Op, DAG);
5954 case ISD::FMINNUM:
5955 case ISD::FMAXNUM:
5956 return lowerFMINNUM_FMAXNUM(Op, DAG);
5957 case ISD::FLDEXP:
5958 case ISD::STRICT_FLDEXP:
5959 return lowerFLDEXP(Op, DAG);
5960 case ISD::FMA:
5961 return splitTernaryVectorOp(Op, DAG);
5962 case ISD::FP_TO_SINT:
5963 case ISD::FP_TO_UINT:
5964 return LowerFP_TO_INT(Op, DAG);
5965 case ISD::SHL:
5966 case ISD::SRA:
5967 case ISD::SRL:
5968 case ISD::ADD:
5969 case ISD::SUB:
5970 case ISD::SMIN:
5971 case ISD::SMAX:
5972 case ISD::UMIN:
5973 case ISD::UMAX:
5974 case ISD::FADD:
5975 case ISD::FMUL:
5976 case ISD::FMINNUM_IEEE:
5977 case ISD::FMAXNUM_IEEE:
5978 case ISD::FMINIMUM:
5979 case ISD::FMAXIMUM:
5980 case ISD::FMINIMUMNUM:
5981 case ISD::FMAXIMUMNUM:
5982 case ISD::UADDSAT:
5983 case ISD::USUBSAT:
5984 case ISD::SADDSAT:
5985 case ISD::SSUBSAT:
5986 return splitBinaryVectorOp(Op, DAG);
5987 case ISD::MUL:
5988 return lowerMUL(Op, DAG);
5989 case ISD::SMULO:
5990 case ISD::UMULO:
5991 return lowerXMULO(Op, DAG);
5992 case ISD::SMUL_LOHI:
5993 case ISD::UMUL_LOHI:
5994 return lowerXMUL_LOHI(Op, DAG);
5995 case ISD::DYNAMIC_STACKALLOC:
5996 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5997 case ISD::STACKSAVE:
5998 return LowerSTACKSAVE(Op, DAG);
5999 case ISD::GET_ROUNDING:
6000 return lowerGET_ROUNDING(Op, DAG);
6001 case ISD::SET_ROUNDING:
6002 return lowerSET_ROUNDING(Op, DAG);
6003 case ISD::PREFETCH:
6004 return lowerPREFETCH(Op, DAG);
6005 case ISD::FP_EXTEND:
6006 case ISD::STRICT_FP_EXTEND:
6007 return lowerFP_EXTEND(Op, DAG);
6008 case ISD::GET_FPENV:
6009 return lowerGET_FPENV(Op, DAG);
6010 case ISD::SET_FPENV:
6011 return lowerSET_FPENV(Op, DAG);
6012 }
6013 return SDValue();
6014}
6015
6016// Used for D16: Casts the result of an instruction into the right vector,
6017// packs values if loads return unpacked values.
6019 const SDLoc &DL, SelectionDAG &DAG,
6020 bool Unpacked) {
6021 if (!LoadVT.isVector())
6022 return Result;
6023
6024 // Cast back to the original packed type, or to a larger type that is a
6025 // multiple of 32 bits for D16. Widening the return type is required for
6026 // legalization.
6027 EVT FittingLoadVT = LoadVT;
6028 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6029 FittingLoadVT =
6031 LoadVT.getVectorNumElements() + 1);
6032 }
6033
6034 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6035 // Truncate to v2i16/v4i16.
6036 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6037
6038 // Work around the legalizer scalarizing the truncate after vector op
6039 // legalization but not creating an intermediate vector trunc.
6041 DAG.ExtractVectorElements(Result, Elts);
6042 for (SDValue &Elt : Elts)
6043 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6044
6045 // Pad illegal v1i16/v3f16 to v4i16
6046 if ((LoadVT.getVectorNumElements() % 2) == 1)
6047 Elts.push_back(DAG.getUNDEF(MVT::i16));
6048
6049 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6050
6051 // Bitcast to original type (v2f16/v4f16).
6052 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6053 }
6054
6055 // Cast back to the original packed type.
6056 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6057}
6058
6059SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6060 SelectionDAG &DAG,
6062 bool IsIntrinsic) const {
6063 SDLoc DL(M);
6064
6065 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6066 EVT LoadVT = M->getValueType(0);
6067
6068 EVT EquivLoadVT = LoadVT;
6069 if (LoadVT.isVector()) {
6070 if (Unpacked) {
6071 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6072 LoadVT.getVectorNumElements());
6073 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6074 // Widen v3f16 to legal type
6075 EquivLoadVT =
6077 LoadVT.getVectorNumElements() + 1);
6078 }
6079 }
6080
6081 // Change from v4f16/v2f16 to EquivLoadVT.
6082 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6083
6085 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6086 M->getMemoryVT(), M->getMemOperand());
6087
6088 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6089
6090 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6091}
6092
6093SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6094 SelectionDAG &DAG,
6095 ArrayRef<SDValue> Ops) const {
6096 SDLoc DL(M);
6097 EVT LoadVT = M->getValueType(0);
6098 EVT EltType = LoadVT.getScalarType();
6099 EVT IntVT = LoadVT.changeTypeToInteger();
6100
6101 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6102
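   // A TFE load returns an extra status dword, so the memory node carries a
   // third result value.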
6103 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6104 bool IsTFE = M->getNumValues() == 3;
6105
6106 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6107                                  : AMDGPUISD::BUFFER_LOAD_FORMAT)
6108                         : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6109                                 : AMDGPUISD::BUFFER_LOAD;
6110 
6111 if (IsD16) {
6112 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6113 }
6114
6115 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6116 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6117 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6118 IsTFE);
6119
6120 if (isTypeLegal(LoadVT)) {
6121 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6122 M->getMemOperand(), DAG);
6123 }
6124
6125 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6126 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6127 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6128 M->getMemOperand(), DAG);
6129 return DAG.getMergeValues(
6130 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6131 DL);
6132}
6133
6135 SelectionDAG &DAG) {
6136 EVT VT = N->getValueType(0);
6137 unsigned CondCode = N->getConstantOperandVal(3);
6138 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6139 return DAG.getUNDEF(VT);
6140
6141 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6142
6143 SDValue LHS = N->getOperand(1);
6144 SDValue RHS = N->getOperand(2);
6145
6146 SDLoc DL(N);
6147
6148 EVT CmpVT = LHS.getValueType();
6149 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6150 unsigned PromoteOp =
6151     ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6152 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6153 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6154 }
6155
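   // Emit the comparison with a wave-sized lane-mask result; the value is then
   // zero-extended or truncated to the intrinsic's declared return type.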
6156 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6157
6158 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6159 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6160
6161 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6162 DAG.getCondCode(CCOpcode));
6163 if (VT.bitsEq(CCVT))
6164 return SetCC;
6165 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6166}
6167
6169 SelectionDAG &DAG) {
6170 EVT VT = N->getValueType(0);
6171
6172 unsigned CondCode = N->getConstantOperandVal(3);
6173 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6174 return DAG.getUNDEF(VT);
6175
6176 SDValue Src0 = N->getOperand(1);
6177 SDValue Src1 = N->getOperand(2);
6178 EVT CmpVT = Src0.getValueType();
6179 SDLoc SL(N);
6180
6181 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6182 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6183 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6184 }
6185
6186 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6187 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6188 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6189 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6190 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6191 DAG.getCondCode(CCOpcode));
6192 if (VT.bitsEq(CCVT))
6193 return SetCC;
6194 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6195}
6196
6198 SelectionDAG &DAG) {
6199 EVT VT = N->getValueType(0);
6200 SDValue Src = N->getOperand(1);
6201 SDLoc SL(N);
6202
6203 if (Src.getOpcode() == ISD::SETCC) {
6204 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6205 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6206 Src.getOperand(1), Src.getOperand(2));
6207 }
6208 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6209 // (ballot 0) -> 0
6210 if (Arg->isZero())
6211 return DAG.getConstant(0, SL, VT);
6212
6213 // (ballot 1) -> EXEC/EXEC_LO
6214 if (Arg->isOne()) {
6215 Register Exec;
6216 if (VT.getScalarSizeInBits() == 32)
6217 Exec = AMDGPU::EXEC_LO;
6218 else if (VT.getScalarSizeInBits() == 64)
6219 Exec = AMDGPU::EXEC;
6220 else
6221 return SDValue();
6222
6223 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6224 }
6225 }
6226
6227 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6228 // ISD::SETNE)
6229 return DAG.getNode(
6230 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6231 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6232}
6233
6235 SelectionDAG &DAG) {
6236 EVT VT = N->getValueType(0);
6237 unsigned ValSize = VT.getSizeInBits();
6238 unsigned IID = N->getConstantOperandVal(0);
6239 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6240 IID == Intrinsic::amdgcn_permlanex16;
6241 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6242 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6243 SDLoc SL(N);
6244 MVT IntVT = MVT::getIntegerVT(ValSize);
6245 const GCNSubtarget *ST = TLI.getSubtarget();
6246 unsigned SplitSize = 32;
6247 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6248 ST->hasDPALU_DPP() &&
6249 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6250 SplitSize = 64;
6251
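   // createLaneOp re-emits the intrinsic with the given (possibly extended or
   // split) operands, carrying over any convergence-control glue.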
6252 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6253 SDValue Src2, MVT ValT) -> SDValue {
6255 switch (IID) {
6256 case Intrinsic::amdgcn_permlane16:
6257 case Intrinsic::amdgcn_permlanex16:
6258 case Intrinsic::amdgcn_update_dpp:
6259 Operands.push_back(N->getOperand(6));
6260 Operands.push_back(N->getOperand(5));
6261 Operands.push_back(N->getOperand(4));
6262 [[fallthrough]];
6263 case Intrinsic::amdgcn_writelane:
6264 Operands.push_back(Src2);
6265 [[fallthrough]];
6266 case Intrinsic::amdgcn_readlane:
6267 case Intrinsic::amdgcn_set_inactive:
6268 case Intrinsic::amdgcn_set_inactive_chain_arg:
6269 case Intrinsic::amdgcn_mov_dpp8:
6270 Operands.push_back(Src1);
6271 [[fallthrough]];
6272 case Intrinsic::amdgcn_readfirstlane:
6273 case Intrinsic::amdgcn_permlane64:
6274 Operands.push_back(Src0);
6275 break;
6276 default:
6277 llvm_unreachable("unhandled lane op");
6278 }
6279
6280 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6281 std::reverse(Operands.begin(), Operands.end());
6282
6283 if (SDNode *GL = N->getGluedNode()) {
6284 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6285 GL = GL->getOperand(0).getNode();
6286 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6287 SDValue(GL, 0)));
6288 }
6289
6290 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6291 };
6292
6293 SDValue Src0 = N->getOperand(1);
6294 SDValue Src1, Src2;
6295 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6296 IID == Intrinsic::amdgcn_mov_dpp8 ||
6297 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6298 Src1 = N->getOperand(2);
6299 if (IID == Intrinsic::amdgcn_writelane ||
6300 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6301 Src2 = N->getOperand(3);
6302 }
6303
6304 if (ValSize == SplitSize) {
6305 // Already legal
6306 return SDValue();
6307 }
6308
6309 if (ValSize < 32) {
6310 bool IsFloat = VT.isFloatingPoint();
6311 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6312 SL, MVT::i32);
6313
6314 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6315 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6316 SL, MVT::i32);
6317 }
6318
6319 if (IID == Intrinsic::amdgcn_writelane) {
6320 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6321 SL, MVT::i32);
6322 }
6323
6324 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6325 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6326 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6327 }
6328
6329 if (ValSize % SplitSize != 0)
6330 return SDValue();
6331
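   // unrollLaneOp scalarizes a lane op over the elements of its vector
   // operands and rebuilds the vector result element by element.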
6332 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6333 EVT VT = N->getValueType(0);
6334 unsigned NE = VT.getVectorNumElements();
6335 EVT EltVT = VT.getVectorElementType();
6337 unsigned NumOperands = N->getNumOperands();
6338 SmallVector<SDValue, 4> Operands(NumOperands);
6339 SDNode *GL = N->getGluedNode();
6340
6341 // only handle convergencectrl_glue
6343
6344 for (unsigned i = 0; i != NE; ++i) {
6345 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6346 ++j) {
6347 SDValue Operand = N->getOperand(j);
6348 EVT OperandVT = Operand.getValueType();
6349 if (OperandVT.isVector()) {
6350 // A vector operand; extract a single element.
6351 EVT OperandEltVT = OperandVT.getVectorElementType();
6352 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6353 Operand, DAG.getVectorIdxConstant(i, SL));
6354 } else {
6355 // A scalar operand; just use it as is.
6356 Operands[j] = Operand;
6357 }
6358 }
6359
6360 if (GL)
6361 Operands[NumOperands - 1] =
6362 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6363 SDValue(GL->getOperand(0).getNode(), 0));
6364
6365 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6366 }
6367
6368 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6369 return DAG.getBuildVector(VecVT, SL, Scalars);
6370 };
6371
6372 if (VT.isVector()) {
6373 switch (MVT::SimpleValueType EltTy =
6375 case MVT::i32:
6376 case MVT::f32:
6377 if (SplitSize == 32) {
6378 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6379 return unrollLaneOp(LaneOp.getNode());
6380 }
6381 [[fallthrough]];
6382 case MVT::i16:
6383 case MVT::f16:
6384 case MVT::bf16: {
6385 unsigned SubVecNumElt =
6386 SplitSize / VT.getVectorElementType().getSizeInBits();
6387 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6389 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6390 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6391 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6392 DAG.getConstant(EltIdx, SL, MVT::i32));
6393
6394 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6395 IsPermLane16)
6396 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6397 DAG.getConstant(EltIdx, SL, MVT::i32));
6398
6399 if (IID == Intrinsic::amdgcn_writelane)
6400 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6401 DAG.getConstant(EltIdx, SL, MVT::i32));
6402
6403 Pieces.push_back(
6404 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6405 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6406 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6407 EltIdx += SubVecNumElt;
6408 }
6409 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6410 }
6411 default:
6412 // Handle all other cases by bitcasting to i32 vectors
6413 break;
6414 }
6415 }
6416
6417 MVT VecVT =
6418 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6419 Src0 = DAG.getBitcast(VecVT, Src0);
6420
6421 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6422 Src1 = DAG.getBitcast(VecVT, Src1);
6423
6424 if (IID == Intrinsic::amdgcn_writelane)
6425 Src2 = DAG.getBitcast(VecVT, Src2);
6426
6427 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6428 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6429 return DAG.getBitcast(VT, UnrolledLaneOp);
6430}
6431
6434 SelectionDAG &DAG) const {
6435 switch (N->getOpcode()) {
6436 case ISD::INSERT_VECTOR_ELT: {
6437 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6438 Results.push_back(Res);
6439 return;
6440 }
6441 case ISD::EXTRACT_VECTOR_ELT: {
6442 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6443 Results.push_back(Res);
6444 return;
6445 }
6446 case ISD::INTRINSIC_WO_CHAIN: {
6447 unsigned IID = N->getConstantOperandVal(0);
6448 switch (IID) {
6449 case Intrinsic::amdgcn_make_buffer_rsrc:
6450 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6451 return;
6452 case Intrinsic::amdgcn_cvt_pkrtz: {
6453 SDValue Src0 = N->getOperand(1);
6454 SDValue Src1 = N->getOperand(2);
6455 SDLoc SL(N);
6456 SDValue Cvt =
6457 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6458 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6459 return;
6460 }
6461 case Intrinsic::amdgcn_cvt_pknorm_i16:
6462 case Intrinsic::amdgcn_cvt_pknorm_u16:
6463 case Intrinsic::amdgcn_cvt_pk_i16:
6464 case Intrinsic::amdgcn_cvt_pk_u16: {
6465 SDValue Src0 = N->getOperand(1);
6466 SDValue Src1 = N->getOperand(2);
6467 SDLoc SL(N);
6468 unsigned Opcode;
6469
6470 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6471 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6472 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6473 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6474 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6475 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6476 else
6477 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6478 
6479 EVT VT = N->getValueType(0);
6480 if (isTypeLegal(VT))
6481 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6482 else {
6483 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6484 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6485 }
6486 return;
6487 }
6488 case Intrinsic::amdgcn_s_buffer_load: {
6489 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6490 // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
6491 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6492 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6493 // s_buffer_load_i8.
6494 if (!Subtarget->hasScalarSubwordLoads())
6495 return;
6496 SDValue Op = SDValue(N, 0);
6497 SDValue Rsrc = Op.getOperand(1);
6498 SDValue Offset = Op.getOperand(2);
6499 SDValue CachePolicy = Op.getOperand(3);
6500 EVT VT = Op.getValueType();
6501 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6502 SDLoc DL(Op);
6504 const DataLayout &DataLayout = DAG.getDataLayout();
6505 Align Alignment =
6511 VT.getStoreSize(), Alignment);
6512 SDValue LoadVal;
6513 if (!Offset->isDivergent()) {
6514 SDValue Ops[] = {Rsrc, // source register
6515 Offset, CachePolicy};
6516 SDValue BufferLoad =
6518 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6519 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6520 } else {
6521 SDValue Ops[] = {
6522 DAG.getEntryNode(), // Chain
6523 Rsrc, // rsrc
6524 DAG.getConstant(0, DL, MVT::i32), // vindex
6525 {}, // voffset
6526 {}, // soffset
6527 {}, // offset
6528 CachePolicy, // cachepolicy
6529 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6530 };
6531 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6532 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6533 }
6534 Results.push_back(LoadVal);
6535 return;
6536 }
6537 }
6538 break;
6539 }
6540 case ISD::INTRINSIC_W_CHAIN: {
6541 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6542 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6543 // FIXME: Hacky
6544 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6545 Results.push_back(Res.getOperand(I));
6546 }
6547 } else {
6548 Results.push_back(Res);
6549 Results.push_back(Res.getValue(1));
6550 }
6551 return;
6552 }
6553
6554 break;
6555 }
6556 case ISD::SELECT: {
6557 SDLoc SL(N);
6558 EVT VT = N->getValueType(0);
6559 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6560 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6561 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6562
6563 EVT SelectVT = NewVT;
6564 if (NewVT.bitsLT(MVT::i32)) {
6565 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6566 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6567 SelectVT = MVT::i32;
6568 }
6569
6570 SDValue NewSelect =
6571 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6572
6573 if (NewVT != SelectVT)
6574 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6575 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6576 return;
6577 }
6578 case ISD::FNEG: {
6579 if (N->getValueType(0) != MVT::v2f16)
6580 break;
6581
6582 SDLoc SL(N);
6583 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6584
6585 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6586 DAG.getConstant(0x80008000, SL, MVT::i32));
6587 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6588 return;
6589 }
6590 case ISD::FABS: {
6591 if (N->getValueType(0) != MVT::v2f16)
6592 break;
6593
6594 SDLoc SL(N);
6595 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6596
6597 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6598 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6599 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6600 return;
6601 }
6602 case ISD::FSQRT: {
6603 if (N->getValueType(0) != MVT::f16)
6604 break;
6605 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6606 break;
6607 }
6608 default:
6609 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6610 break;
6611 }
6612}
6613
6614/// Helper function for LowerBRCOND
6615static SDNode *findUser(SDValue Value, unsigned Opcode) {
6616
6617 for (SDUse &U : Value->uses()) {
6618 if (U.get() != Value)
6619 continue;
6620
6621 if (U.getUser()->getOpcode() == Opcode)
6622 return U.getUser();
6623 }
6624 return nullptr;
6625}
6626
6627unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6628 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6629 switch (Intr->getConstantOperandVal(1)) {
6630 case Intrinsic::amdgcn_if:
6631 return AMDGPUISD::IF;
6632 case Intrinsic::amdgcn_else:
6633 return AMDGPUISD::ELSE;
6634 case Intrinsic::amdgcn_loop:
6635 return AMDGPUISD::LOOP;
6636 case Intrinsic::amdgcn_end_cf:
6637 llvm_unreachable("should not occur");
6638 default:
6639 return 0;
6640 }
6641 }
6642
6643 // break, if_break, else_break are all only used as inputs to loop, not
6644 // directly as branch conditions.
6645 return 0;
6646}
6647
6649 const Triple &TT = getTargetMachine().getTargetTriple();
6653}
6654
6656 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6657 return false;
6658
6659 // FIXME: Either avoid relying on address space here or change the default
6660 // address space for functions to avoid the explicit check.
6661 return (GV->getValueType()->isFunctionTy() ||
6664}
6665
6667 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6668}
6669
6671 if (!GV->hasExternalLinkage())
6672 return true;
6673
6674 const auto OS = getTargetMachine().getTargetTriple().getOS();
6675 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6676}
6677
6678 /// This transforms the control flow intrinsics to get the branch destination
6679 /// as the last parameter, and also switches the branch target with BR when needed.
6680SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6681 SDLoc DL(BRCOND);
6682
6683 SDNode *Intr = BRCOND.getOperand(1).getNode();
6684 SDValue Target = BRCOND.getOperand(2);
6685 SDNode *BR = nullptr;
6686 SDNode *SetCC = nullptr;
6687
6688 if (Intr->getOpcode() == ISD::SETCC) {
6689 // As long as we negate the condition everything is fine
6690 SetCC = Intr;
6691 Intr = SetCC->getOperand(0).getNode();
6692
6693 } else {
6694 // Get the target from BR if we don't negate the condition
6695 BR = findUser(BRCOND, ISD::BR);
6696 assert(BR && "brcond missing unconditional branch user");
6697 Target = BR->getOperand(1);
6698 }
6699
6700 unsigned CFNode = isCFIntrinsic(Intr);
6701 if (CFNode == 0) {
6702 // This is a uniform branch so we don't need to legalize.
6703 return BRCOND;
6704 }
6705
6706 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6707 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6708
6709 assert(!SetCC ||
6710 (SetCC->getConstantOperandVal(1) == 1 &&
6711 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6712 ISD::SETNE));
6713
6714 // operands of the new intrinsic call
6716 if (HaveChain)
6717 Ops.push_back(BRCOND.getOperand(0));
6718
6719 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6720 Ops.push_back(Target);
6721
6722 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6723
6724 // build the new intrinsic call
6725 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6726
6727 if (!HaveChain) {
6728 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6729
6730 Result = DAG.getMergeValues(Ops, DL).getNode();
6731 }
6732
6733 if (BR) {
6734 // Give the branch instruction our target
6735 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6736 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6737 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6738 }
6739
6740 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6741
6742 // Copy the intrinsic results to registers
6743 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6745 if (!CopyToReg)
6746 continue;
6747
6748 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6749 SDValue(Result, i - 1), SDValue());
6750
6751 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6752 }
6753
6754 // Remove the old intrinsic from the chain
6755 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6756 Intr->getOperand(0));
6757
6758 return Chain;
6759}
6760
6761SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6762 MVT VT = Op.getSimpleValueType();
6763 SDLoc DL(Op);
6764 // Checking the depth
6765 if (Op.getConstantOperandVal(0) != 0)
6766 return DAG.getConstant(0, DL, VT);
6767
6770 // Check for kernel and shader functions
6771 if (Info->isEntryFunction())
6772 return DAG.getConstant(0, DL, VT);
6773
6774 MachineFrameInfo &MFI = MF.getFrameInfo();
6775 // There is a call to @llvm.returnaddress in this function
6776 MFI.setReturnAddressIsTaken(true);
6777
6779 // Get the return address reg and mark it as an implicit live-in
6780 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6781 getRegClassFor(VT, Op.getNode()->isDivergent()));
6782
6783 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6784}
6785
6786SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6787 const SDLoc &DL, EVT VT) const {
6788 return Op.getValueType().bitsLE(VT)
6789 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6790 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6791 DAG.getTargetConstant(0, DL, MVT::i32));
6792}
6793
6794SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6795 assert(Op.getValueType() == MVT::f16 &&
6796 "Do not know how to custom lower FP_ROUND for non-f16 type");
6797
6798 SDValue Src = Op.getOperand(0);
6799 EVT SrcVT = Src.getValueType();
6800 if (SrcVT != MVT::f64)
6801 return Op;
6802
6803 // TODO: Handle strictfp
6804 if (Op.getOpcode() != ISD::FP_ROUND)
6805 return Op;
6806
6807 SDLoc DL(Op);
6808
6809 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6810 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6811 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6812}
6813
6814SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6815 SelectionDAG &DAG) const {
6816 EVT VT = Op.getValueType();
6817 const MachineFunction &MF = DAG.getMachineFunction();
6819 bool IsIEEEMode = Info->getMode().IEEE;
6820
6821 // FIXME: Assert during selection that this is only selected for
6822 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6823 // mode functions, but this happens to be OK since it's only done in cases
6824 // where there is known no sNaN.
6825 if (IsIEEEMode)
6826 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6827
6828 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6829 VT == MVT::v16bf16)
6830 return splitBinaryVectorOp(Op, DAG);
6831 return Op;
6832}
6833
6834SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6835 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6836 EVT VT = Op.getValueType();
6837 assert(VT == MVT::f16);
6838
6839 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6840 EVT ExpVT = Exp.getValueType();
6841 if (ExpVT == MVT::i16)
6842 return Op;
6843
6844 SDLoc DL(Op);
6845
6846 // Correct the exponent type for f16 to i16.
6847 // Clamp the range of the exponent to the instruction's range.
6848
6849 // TODO: This should be a generic narrowing legalization, and can easily be
6850 // done for GlobalISel as well.
6851
6852 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6853 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6854
6855 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6856 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6857
6858 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6859
6860 if (IsStrict) {
6861 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6862 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6863 }
6864
6865 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6866}
6867
6869 switch (Op->getOpcode()) {
6870 case ISD::SRA:
6871 case ISD::SMIN:
6872 case ISD::SMAX:
6873 return ISD::SIGN_EXTEND;
6874 case ISD::SRL:
6875 case ISD::UMIN:
6876 case ISD::UMAX:
6877 return ISD::ZERO_EXTEND;
6878 case ISD::ADD:
6879 case ISD::SUB:
6880 case ISD::AND:
6881 case ISD::OR:
6882 case ISD::XOR:
6883 case ISD::SHL:
6884 case ISD::SELECT:
6885 case ISD::MUL:
6886 // operation result won't be influenced by garbage high bits.
6887 // TODO: are all of those cases correct, and are there more?
6888 return ISD::ANY_EXTEND;
6889 case ISD::SETCC: {
6890 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6891 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6892 }
6893 default:
6894 llvm_unreachable("unexpected opcode!");
6895 }
6896}
6897
6898SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6899 DAGCombinerInfo &DCI) const {
6900 const unsigned Opc = Op.getOpcode();
6901 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6902 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6903 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6904 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6905 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6906
6907 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6908 : Op->getOperand(0).getValueType();
6909 auto ExtTy = OpTy.changeElementType(MVT::i32);
6910
6911 if (DCI.isBeforeLegalizeOps() ||
6912 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6913 return SDValue();
6914
6915 auto &DAG = DCI.DAG;
6916
6917 SDLoc DL(Op);
6918 SDValue LHS;
6919 SDValue RHS;
6920 if (Opc == ISD::SELECT) {
6921 LHS = Op->getOperand(1);
6922 RHS = Op->getOperand(2);
6923 } else {
6924 LHS = Op->getOperand(0);
6925 RHS = Op->getOperand(1);
6926 }
6927
6928 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6929 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6930
6931 // Special case: for shifts, the RHS always needs a zext.
6932 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6933 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6934 else
6935 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6936
6937 // setcc always returns an i1/i1 vector, so there is no need to truncate after.
6938 if (Opc == ISD::SETCC) {
6939 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6940 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6941 }
6942
6943 // For other ops, we extend the operation's return type as well so we need to
6944 // truncate back to the original type.
6945 SDValue NewVal;
6946 if (Opc == ISD::SELECT)
6947 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6948 else
6949 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6950
6951 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6952}
6953
6954// Custom lowering for vector multiplications and s_mul_u64.
6955SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6956 EVT VT = Op.getValueType();
6957
6958 // Split vector operands.
6959 if (VT.isVector())
6960 return splitBinaryVectorOp(Op, DAG);
6961
6962 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6963
6964 // There are four ways to lower s_mul_u64:
6965 //
6966 // 1. If all the operands are uniform, then we lower it as it is.
6967 //
6968 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6969 // multiplications because there is not a vector equivalent of s_mul_u64.
6970 //
6971 // 3. If the cost model decides that it is more efficient to use vector
6972 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6973 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6974 //
6975 // 4. If the cost model decides to use vector registers and both of the
6976 // operands are zero-extended/sign-extended from 32-bits, then we split the
6977 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6978 // possible to check if the operands are zero-extended or sign-extended in
6979 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6980 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6981 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6982 // If the cost model decides that we have to use vector registers, then
6983 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6984 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6985 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6986 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6987 // SIInstrInfo.cpp .
6988
6989 if (Op->isDivergent())
6990 return SDValue();
6991
6992 SDValue Op0 = Op.getOperand(0);
6993 SDValue Op1 = Op.getOperand(1);
6994 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6995 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6996 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6997 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6998 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6999 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7000 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7001 SDLoc SL(Op);
7002 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7003 return SDValue(
7004 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7005 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7006 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7007 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7008 return SDValue(
7009 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7010 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7011 return Op;
7012}
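// --- Illustrative sketch (editor addition, not part of the lowering above) ---
// The 32-bit split referred to in the comment above computes a 64-bit product
// from 32-bit halves: a*b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32),
// taken modulo 2^64. The helper below is a hypothetical, plain-C++ model of
// that decomposition, assuming ordinary fixed-width integer types.
static inline uint64_t mul64ViaMul32Halves(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t Lo = ALo * BLo;                // full 64-bit low x low product
  uint64_t Cross = ALo * BHi + AHi * BLo; // terms that land in the high half
  return Lo + (Cross << 32);              // hi(A)*hi(B) falls off modulo 2^64
}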
7013
7014SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7015 EVT VT = Op.getValueType();
7016 SDLoc SL(Op);
7017 SDValue LHS = Op.getOperand(0);
7018 SDValue RHS = Op.getOperand(1);
7019 bool isSigned = Op.getOpcode() == ISD::SMULO;
7020
7021 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7022 const APInt &C = RHSC->getAPIntValue();
7023 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7024 if (C.isPowerOf2()) {
7025 // smulo(x, signed_min) is same as umulo(x, signed_min).
7026 bool UseArithShift = isSigned && !C.isMinSignedValue();
7027 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7028 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7029 SDValue Overflow =
7030 DAG.getSetCC(SL, MVT::i1,
7031 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7032 Result, ShiftAmt),
7033 LHS, ISD::SETNE);
7034 return DAG.getMergeValues({Result, Overflow}, SL);
7035 }
7036 }
7037
7038 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7039 SDValue Top =
7040 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7041
7042 SDValue Sign = isSigned
7043 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7044 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7045 SL, MVT::i32))
7046 : DAG.getConstant(0, SL, VT);
7047 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7048
7049 return DAG.getMergeValues({Result, Overflow}, SL);
7050}
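// --- Illustrative sketch (editor addition, plain C++; an assumption, not the
// DAG code above) --- For the power-of-two case the overflow flag is computed
// by shifting the product back down and comparing with the original operand:
// overflow happened exactly when high bits were shifted out.
static inline bool umulPow2Overflows(uint32_t X, unsigned ShAmt) {
  uint32_t Product = X << ShAmt;  // X * (1u << ShAmt), modulo 2^32
  return (Product >> ShAmt) != X; // logical shift back; mismatch => overflow
}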
7051
7052SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7053 if (Op->isDivergent()) {
7054 // Select to V_MAD_[IU]64_[IU]32.
7055 return Op;
7056 }
7057 if (Subtarget->hasSMulHi()) {
7058 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7059 return SDValue();
7060 }
7061 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7062 // calculate the high part, so we might as well do the whole thing with
7063 // V_MAD_[IU]64_[IU]32.
7064 return Op;
7065}
7066
7067SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7068   if (!Subtarget->isTrapHandlerEnabled() ||
7069       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7070     return lowerTrapEndpgm(Op, DAG);
7071
7072 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7073 : lowerTrapHsaQueuePtr(Op, DAG);
7074}
7075
7076SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7077 SDLoc SL(Op);
7078 SDValue Chain = Op.getOperand(0);
7079 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7080}
7081
7082SDValue
7083SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7084 const SDLoc &DL, Align Alignment,
7085 ImplicitParameter Param) const {
7086   MachineFunction &MF = DAG.getMachineFunction();
7087   uint64_t Offset = getImplicitParameterOffset(MF, Param);
7088   SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7089   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7090   return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7091                      MachineMemOperand::MODereferenceable |
7092                          MachineMemOperand::MOInvariant);
7093 }
7094
7095SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7096 SelectionDAG &DAG) const {
7097 SDLoc SL(Op);
7098 SDValue Chain = Op.getOperand(0);
7099
7100 SDValue QueuePtr;
7101 // For code object version 5, QueuePtr is passed through implicit kernarg.
7102   const Module *M = DAG.getMachineFunction().getFunction().getParent();
7103   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7104     QueuePtr =
7105         loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7106   } else {
7107     MachineFunction &MF = DAG.getMachineFunction();
7108     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7109     Register UserSGPR = Info->getQueuePtrUserSGPR();
7110
7111 if (UserSGPR == AMDGPU::NoRegister) {
7112 // We probably are in a function incorrectly marked with
7113 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7114 // trap, so just use a null pointer.
7115 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7116 } else {
7117 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7118 MVT::i64);
7119 }
7120 }
7121
7122 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7123 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7124
7125   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7126   SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7127 ToReg.getValue(1)};
7128 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7129}
7130
7131SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7132 SDLoc SL(Op);
7133 SDValue Chain = Op.getOperand(0);
7134
7135 // We need to simulate the 's_trap 2' instruction on targets that run in
7136 // PRIV=1 (where it is treated as a nop).
7137 if (Subtarget->hasPrivEnabledTrap2NopBug())
7138 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7139
7140   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7141   SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7142 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7143}
7144
7145SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7146 SDLoc SL(Op);
7147 SDValue Chain = Op.getOperand(0);
7148   MachineFunction &MF = DAG.getMachineFunction();
7149 
7150   if (!Subtarget->isTrapHandlerEnabled() ||
7151       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7152     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
7153                                      "debugtrap handler not supported",
7154                                      Op.getDebugLoc(), DS_Warning);
7155 LLVMContext &Ctx = MF.getFunction().getContext();
7156 Ctx.diagnose(NoTrap);
7157 return Chain;
7158 }
7159
7160   uint64_t TrapID =
7161       static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7162 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7163 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7164}
7165
7166SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7167 SelectionDAG &DAG) const {
7168 if (Subtarget->hasApertureRegs()) {
7169 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7170 ? AMDGPU::SRC_SHARED_BASE
7171 : AMDGPU::SRC_PRIVATE_BASE;
7172 // Note: this feature (register) is broken. When used as a 32-bit operand,
7173 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7174 // bits.
7175 //
7176 // To work around the issue, directly emit a 64 bit mov from this register
7177 // then extract the high bits. Note that this shouldn't even result in a
7178 // shift being emitted and simply become a pair of registers (e.g.):
7179 // s_mov_b64 s[6:7], src_shared_base
7180 // v_mov_b32_e32 v1, s7
7181 //
7182 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7183 // coalescing would kick in and it would think it's okay to use the "HI"
7184 // subregister directly (instead of extracting the HI 32 bits) which is an
7185 // artificial (unusable) register.
7186 // Register TableGen definitions would need an overhaul to get rid of the
7187 // artificial "HI" aperture registers and prevent this kind of issue from
7188 // happening.
7189 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7190 DAG.getRegister(ApertureRegNo, MVT::i64));
7191 return DAG.getNode(
7192 ISD::TRUNCATE, DL, MVT::i32,
7193 DAG.getNode(ISD::SRL, DL, MVT::i64,
7194 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7195 }
7196
7197 // For code object version 5, private_base and shared_base are passed through
7198 // implicit kernargs.
7199 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7200   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7201     ImplicitParameter Param =
7202         (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7203     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7204   }
7205
7206   MachineFunction &MF = DAG.getMachineFunction();
7207   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7208   Register UserSGPR = Info->getQueuePtrUserSGPR();
7209 if (UserSGPR == AMDGPU::NoRegister) {
7210 // We probably are in a function incorrectly marked with
7211 // amdgpu-no-queue-ptr. This is undefined.
7212 return DAG.getUNDEF(MVT::i32);
7213 }
7214
7215 SDValue QueuePtr =
7216 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7217
7218 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7219 // private_segment_aperture_base_hi.
7220 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7221
7222 SDValue Ptr =
7223 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7224
7225 // TODO: Use custom target PseudoSourceValue.
7226 // TODO: We should use the value from the IR intrinsic call, but it might not
7227 // be available and how do we get it?
7228   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7229   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7230                      commonAlignment(Align(64), StructOffset),
7231                      MachineMemOperand::MODereferenceable |
7232                          MachineMemOperand::MOInvariant);
7233 }
7234
7235/// Return true if the value is a known valid address, such that a null check is
7236/// not necessary.
7237 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7238                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7239 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7240 isa<BasicBlockSDNode>(Val))
7241 return true;
7242
7243 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7244 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7245
7246 // TODO: Search through arithmetic, handle arguments and loads
7247 // marked nonnull.
7248 return false;
7249}
7250
7251SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7252 SelectionDAG &DAG) const {
7253 SDLoc SL(Op);
7254
7255 const AMDGPUTargetMachine &TM =
7256 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7257
7258 unsigned DestAS, SrcAS;
7259 SDValue Src;
7260 bool IsNonNull = false;
7261 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7262 SrcAS = ASC->getSrcAddressSpace();
7263 Src = ASC->getOperand(0);
7264 DestAS = ASC->getDestAddressSpace();
7265 } else {
7266 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7267 Op.getConstantOperandVal(0) ==
7268 Intrinsic::amdgcn_addrspacecast_nonnull);
7269 Src = Op->getOperand(1);
7270 SrcAS = Op->getConstantOperandVal(2);
7271 DestAS = Op->getConstantOperandVal(3);
7272 IsNonNull = true;
7273 }
7274
7275 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7276
7277 // flat -> local/private
7278 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7279 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7280 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7281 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7282
7283 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7284 return Ptr;
7285
7286 unsigned NullVal = TM.getNullPointerValue(DestAS);
7287 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7288 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7289
7290 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7291 SegmentNullPtr);
7292 }
7293 }
7294
7295 // local/private -> flat
7296 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7297 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7298 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7299
7300 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7301 SDValue CvtPtr =
7302 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7303 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7304
7305 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7306 return CvtPtr;
7307
7308 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7309 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7310
7311 SDValue NonNull =
7312 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7313
7314 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7315 FlatNullPtr);
7316 }
7317 }
7318
7319 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7320 Op.getValueType() == MVT::i64) {
7321     const SIMachineFunctionInfo *Info =
7322         DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7323     SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7324 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7325 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7326 }
7327
7328 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7329 Src.getValueType() == MVT::i64)
7330 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7331
7332 // global <-> flat are no-ops and never emitted.
7333
7334 const MachineFunction &MF = DAG.getMachineFunction();
7335 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7336 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7337 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7338
7339 return DAG.getUNDEF(Op->getValueType(0));
7340}
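// --- Illustrative sketch (editor addition, plain C++ integer model; an
// assumption, not the DAG code above) --- A local/private -> flat cast builds
// the 64-bit flat pointer from the 32-bit segment offset and the aperture
// high half, mapping the segment null value to the flat null pointer.
static inline uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi,
                                     uint32_t SegmentNullVal) {
  if (SegPtr == SegmentNullVal)
    return 0;                                   // flat null pointer
  return (uint64_t(ApertureHi) << 32) | SegPtr; // {aperture, offset} pair
}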
7341
7342// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7343// the small vector and inserting them into the big vector. That is better than
7344// the default expansion of doing it via a stack slot. Even though the use of
7345// the stack slot would be optimized away afterwards, the stack slot itself
7346// remains.
7347SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7348 SelectionDAG &DAG) const {
7349 SDValue Vec = Op.getOperand(0);
7350 SDValue Ins = Op.getOperand(1);
7351 SDValue Idx = Op.getOperand(2);
7352 EVT VecVT = Vec.getValueType();
7353 EVT InsVT = Ins.getValueType();
7354 EVT EltVT = VecVT.getVectorElementType();
7355 unsigned InsNumElts = InsVT.getVectorNumElements();
7356 unsigned IdxVal = Idx->getAsZExtVal();
7357 SDLoc SL(Op);
7358
7359 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7360 // Insert 32-bit registers at a time.
7361 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7362
7363 unsigned VecNumElts = VecVT.getVectorNumElements();
7364 EVT NewVecVT =
7365 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7366     EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7367                                    : EVT::getVectorVT(*DAG.getContext(),
7368                                                       MVT::i32, InsNumElts / 2);
7369
7370 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7371 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7372
7373 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7374 SDValue Elt;
7375 if (InsNumElts == 2) {
7376 Elt = Ins;
7377 } else {
7378 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7379 DAG.getConstant(I, SL, MVT::i32));
7380 }
7381 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7382 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7383 }
7384
7385 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7386 }
7387
7388 for (unsigned I = 0; I != InsNumElts; ++I) {
7389 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7390 DAG.getConstant(I, SL, MVT::i32));
7391 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7392 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7393 }
7394 return Vec;
7395}
7396
7397SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7398 SelectionDAG &DAG) const {
7399 SDValue Vec = Op.getOperand(0);
7400 SDValue InsVal = Op.getOperand(1);
7401 SDValue Idx = Op.getOperand(2);
7402 EVT VecVT = Vec.getValueType();
7403 EVT EltVT = VecVT.getVectorElementType();
7404 unsigned VecSize = VecVT.getSizeInBits();
7405 unsigned EltSize = EltVT.getSizeInBits();
7406 SDLoc SL(Op);
7407
7408 // Specially handle the case of v4i16 with static indexing.
7409 unsigned NumElts = VecVT.getVectorNumElements();
7410 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7411 if (NumElts == 4 && EltSize == 16 && KIdx) {
7412 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7413
7414 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7415 DAG.getConstant(0, SL, MVT::i32));
7416 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7417 DAG.getConstant(1, SL, MVT::i32));
7418
7419 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7420 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7421
7422 unsigned Idx = KIdx->getZExtValue();
7423 bool InsertLo = Idx < 2;
7424 SDValue InsHalf = DAG.getNode(
7425 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7426 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7427 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7428
7429 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7430
7431 SDValue Concat =
7432 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7433 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7434
7435 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7436 }
7437
7438 // Static indexing does not lower to stack access, and hence there is no need
7439 // for special custom lowering to avoid stack access.
7440 if (isa<ConstantSDNode>(Idx))
7441 return SDValue();
7442
7443 // Avoid stack access for dynamic indexing by custom lowering to
7444 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7445
7446 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7447
7448 MVT IntVT = MVT::getIntegerVT(VecSize);
7449
7450 // Convert vector index to bit-index and get the required bit mask.
7451 assert(isPowerOf2_32(EltSize));
7452 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7453 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7454 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7455 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7456 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7457
7458 // 1. Create a congruent vector with the target value in each element.
7459 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7460 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7461
7462 // 2. Mask off all other indices except the required index within (1).
7463 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7464
7465 // 3. Mask off the required index within the target vector.
7466 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7467 SDValue RHS =
7468 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7469
7470 // 4. Get (2) and (3) ORed into the target vector.
7471 SDValue BFI =
7472 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7473
7474 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7475}
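// --- Illustrative sketch (editor addition, plain C++ bit arithmetic on a
// <=64-bit vector; an assumption, not the DAG code above) --- The dynamic
// insert builds a per-element mask and blends splat(value) into the vector
// bits, i.e. the v_bfi pattern (mask & splat) | (~mask & vec).
static inline uint64_t insertEltViaBfi(uint64_t VecBits, uint64_t EltVal,
                                       unsigned Idx, unsigned EltSizeBits) {
  // Assumes EltSizeBits divides 64 and Idx * EltSizeBits < 64.
  uint64_t EltMask = EltSizeBits == 64 ? ~0ull : ((1ull << EltSizeBits) - 1);
  uint64_t Mask = EltMask << (Idx * EltSizeBits); // v_bfm-style element mask
  uint64_t Splat = 0;
  for (unsigned I = 0; I < 64; I += EltSizeBits)  // splat the element value
    Splat |= (EltVal & EltMask) << I;
  return (Mask & Splat) | (~Mask & VecBits);      // v_bfi-style blend
}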
7476
7477SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7478 SelectionDAG &DAG) const {
7479 SDLoc SL(Op);
7480
7481 EVT ResultVT = Op.getValueType();
7482 SDValue Vec = Op.getOperand(0);
7483 SDValue Idx = Op.getOperand(1);
7484 EVT VecVT = Vec.getValueType();
7485 unsigned VecSize = VecVT.getSizeInBits();
7486 EVT EltVT = VecVT.getVectorElementType();
7487
7488 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7489
7490 // Make sure we do any optimizations that will make it easier to fold
7491 // source modifiers before obscuring it with bit operations.
7492
7493 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7494 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7495 return Combined;
7496
7497 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7498 SDValue Lo, Hi;
7499 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7500
7501 if (VecSize == 128) {
7502 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7503 Lo = DAG.getBitcast(LoVT,
7504 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7505 DAG.getConstant(0, SL, MVT::i32)));
7506 Hi = DAG.getBitcast(HiVT,
7507 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7508 DAG.getConstant(1, SL, MVT::i32)));
7509 } else if (VecSize == 256) {
7510 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7511 SDValue Parts[4];
7512 for (unsigned P = 0; P < 4; ++P) {
7513 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7514 DAG.getConstant(P, SL, MVT::i32));
7515 }
7516
7517 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7518 Parts[0], Parts[1]));
7519 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7520 Parts[2], Parts[3]));
7521 } else {
7522 assert(VecSize == 512);
7523
7524 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7525 SDValue Parts[8];
7526 for (unsigned P = 0; P < 8; ++P) {
7527 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7528 DAG.getConstant(P, SL, MVT::i32));
7529 }
7530
7531 Lo = DAG.getBitcast(LoVT,
7532 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7533 Parts[0], Parts[1], Parts[2], Parts[3]));
7534 Hi = DAG.getBitcast(HiVT,
7535 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7536 Parts[4], Parts[5], Parts[6], Parts[7]));
7537 }
7538
7539 EVT IdxVT = Idx.getValueType();
7540 unsigned NElem = VecVT.getVectorNumElements();
7541 assert(isPowerOf2_32(NElem));
7542 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7543 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7544 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7545 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7546 }
7547
7548 assert(VecSize <= 64);
7549
7550 MVT IntVT = MVT::getIntegerVT(VecSize);
7551
7552 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7553 SDValue VecBC = peekThroughBitcasts(Vec);
7554 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7555 SDValue Src = VecBC.getOperand(0);
7556 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7557 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7558 }
7559
7560 unsigned EltSize = EltVT.getSizeInBits();
7561 assert(isPowerOf2_32(EltSize));
7562
7563 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7564
7565 // Convert vector index to bit-index (* EltSize)
7566 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7567
7568 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7569 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7570
7571 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7572 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7573 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7574 }
7575
7576 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7577}
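// --- Illustrative counterpart of the dynamic extract above (editor addition,
// plain C++; an assumption, not the DAG code) --- The element is recovered by
// shifting the bitcast vector right by idx * element-size and truncating.
static inline uint64_t extractEltViaShift(uint64_t VecBits, unsigned Idx,
                                          unsigned EltSizeBits) {
  uint64_t EltMask = EltSizeBits == 64 ? ~0ull : ((1ull << EltSizeBits) - 1);
  return (VecBits >> (Idx * EltSizeBits)) & EltMask; // SRL, then truncate
}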
7578
7579static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7580 assert(Elt % 2 == 0);
7581 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7582}
7583
7584SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7585 SelectionDAG &DAG) const {
7586 SDLoc SL(Op);
7587 EVT ResultVT = Op.getValueType();
7588 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7589 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7590 MVT PackVT = MVT::getVectorVT(EltVT, 2);
7591 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7592
7593 // vector_shuffle <0,1,6,7> lhs, rhs
7594 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7595 //
7596 // vector_shuffle <6,7,2,3> lhs, rhs
7597 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7598 //
7599 // vector_shuffle <6,7,0,1> lhs, rhs
7600 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7601
7602 // Avoid scalarizing when both halves are reading from consecutive elements.
7603   SmallVector<SDValue, 16> Pieces;
7604   for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7605 if (elementPairIsContiguous(SVN->getMask(), I)) {
7606 const int Idx = SVN->getMaskElt(I);
7607 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7608 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7609 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7610 SVN->getOperand(VecIdx),
7611 DAG.getConstant(EltIdx, SL, MVT::i32));
7612 Pieces.push_back(SubVec);
7613 } else {
7614 const int Idx0 = SVN->getMaskElt(I);
7615 const int Idx1 = SVN->getMaskElt(I + 1);
7616 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7617 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7618 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7619 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7620
7621 SDValue Vec0 = SVN->getOperand(VecIdx0);
7622 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7623 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7624
7625 SDValue Vec1 = SVN->getOperand(VecIdx1);
7626 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7627 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7628 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7629 }
7630 }
7631
7632 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7633}
7634
7635SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7636 SelectionDAG &DAG) const {
7637 SDValue SVal = Op.getOperand(0);
7638 EVT ResultVT = Op.getValueType();
7639 EVT SValVT = SVal.getValueType();
7640 SDValue UndefVal = DAG.getUNDEF(SValVT);
7641 SDLoc SL(Op);
7642
7643   SmallVector<SDValue, 16> VElts;
7644   VElts.push_back(SVal);
7645 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7646 VElts.push_back(UndefVal);
7647
7648 return DAG.getBuildVector(ResultVT, SL, VElts);
7649}
7650
7651SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7652 SelectionDAG &DAG) const {
7653 SDLoc SL(Op);
7654 EVT VT = Op.getValueType();
7655
7656 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7657 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7658
7659 SDValue Lo = Op.getOperand(0);
7660 SDValue Hi = Op.getOperand(1);
7661
7662 // Avoid adding defined bits with the zero_extend.
7663 if (Hi.isUndef()) {
7664 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7665 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7666 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7667 }
7668
7669 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7670 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7671
7672 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7673 DAG.getConstant(16, SL, MVT::i32));
7674 if (Lo.isUndef())
7675 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7676
7677 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7678 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7679
7680 SDValue Or =
7681 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7682 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7683 }
7684
7685 // Split into 2-element chunks.
7686 const unsigned NumParts = VT.getVectorNumElements() / 2;
7687   MVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
7688   MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7689 
7690   SmallVector<SDValue> Casts;
7691   for (unsigned P = 0; P < NumParts; ++P) {
7692 SDValue Vec = DAG.getBuildVector(
7693 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7694 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7695 }
7696
7697 SDValue Blend =
7698 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7699 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7700}
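// --- Illustrative sketch (editor addition, plain C++; an assumption) --- A
// two-element 16-bit build_vector is materialized as one 32-bit value, with
// element 0 in the low half and element 1 zero-extended and shifted into the
// high half, matching the zext/shl/or sequence above.
static inline uint32_t packTwoHalfWords(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}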
7701
7702 bool SITargetLowering::isOffsetFoldingLegal(
7703     const GlobalAddressSDNode *GA) const {
7704 // OSes that use ELF REL relocations (instead of RELA) can only store a
7705 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7706 // which can create arbitrary 64-bit addends. (This is only a problem for
7707 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7708 // the high 32 bits of the addend.)
7709 //
7710 // This should be kept in sync with how HasRelocationAddend is initialized in
7711 // the constructor of ELFAMDGPUAsmBackend.
7712 if (!Subtarget->isAmdHsaOS())
7713 return false;
7714
7715 // We can fold offsets for anything that doesn't require a GOT relocation.
7716 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7717           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7718           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7719          !shouldEmitGOTReloc(GA->getGlobal());
7720 }
7721
7722static SDValue
7723 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7724                         const SDLoc &DL, int64_t Offset, EVT PtrVT,
7725 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7726 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7727 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7728 // lowered to the following code sequence:
7729 //
7730 // For constant address space:
7731 // s_getpc_b64 s[0:1]
7732 // s_add_u32 s0, s0, $symbol
7733 // s_addc_u32 s1, s1, 0
7734 //
7735 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7736 // a fixup or relocation is emitted to replace $symbol with a literal
7737 // constant, which is a pc-relative offset from the encoding of the $symbol
7738 // operand to the global variable.
7739 //
7740 // For global address space:
7741 // s_getpc_b64 s[0:1]
7742 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7743 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7744 //
7745 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7746 // fixups or relocations are emitted to replace $symbol@*@lo and
7747 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7748 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7749 // operand to the global variable.
7750 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7751 SDValue PtrHi;
7752 if (GAFlags == SIInstrInfo::MO_NONE)
7753 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7754 else
7755 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7756 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7757}
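// --- Illustrative sketch (editor addition, plain C++; a simplified model of
// the s_add_u32 / s_addc_u32 pair documented above, not compiler code) ---
// The 64-bit pc-relative offset is added to the value returned by s_getpc_b64
// in two 32-bit halves, with the carry of the low add feeding the high add.
static inline uint64_t addPCRel64(uint64_t PC, uint64_t Rel) {
  uint32_t Lo = uint32_t(PC) + uint32_t(Rel);                     // s_add_u32
  uint32_t Carry = Lo < uint32_t(PC) ? 1u : 0u;                   // carry via SCC
  uint32_t Hi = uint32_t(PC >> 32) + uint32_t(Rel >> 32) + Carry; // s_addc_u32
  return (uint64_t(Hi) << 32) | Lo;
}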
7758
7759SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7760 SDValue Op,
7761 SelectionDAG &DAG) const {
7762 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7763 SDLoc DL(GSD);
7764 EVT PtrVT = Op.getValueType();
7765
7766 const GlobalValue *GV = GSD->getGlobal();
7767   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7768        shouldUseLDSConstAddress(GV)) ||
7769       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7770       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7771     if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7772         GV->hasExternalLinkage()) {
7773 Type *Ty = GV->getValueType();
7774 // HIP uses an unsized array `extern __shared__ T s[]` or similar
7775 // zero-sized type in other languages to declare the dynamic shared
7776 // memory which size is not known at the compile time. They will be
7777 // allocated by the runtime and placed directly after the static
7778 // allocated ones. They all share the same offset.
7779 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7780 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7781 // Adjust alignment for that dynamic shared memory array.
7782         Function &F = DAG.getMachineFunction().getFunction();
7783         MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7784 MFI->setUsesDynamicLDS(true);
7785 return SDValue(
7786 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7787 }
7788 }
7789     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7790   }
7791 
7792   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7793     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7794                                             SIInstrInfo::MO_ABS32_LO);
7795 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7796 }
7797
7798 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7799 SDValue AddrLo = DAG.getTargetGlobalAddress(
7800 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7801 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7802
7803 SDValue AddrHi = DAG.getTargetGlobalAddress(
7804 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7805 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7806
7807 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7808 }
7809
7810 if (shouldEmitFixup(GV))
7811 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7812
7813 if (shouldEmitPCReloc(GV))
7814 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7815                                    SIInstrInfo::MO_REL32);
7816 
7817   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7818                                             SIInstrInfo::MO_GOTPCREL32);
7819 
7820 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7821   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7822   const DataLayout &DataLayout = DAG.getDataLayout();
7823 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7824 MachinePointerInfo PtrInfo =
7825       MachinePointerInfo::getGOT(DAG.getMachineFunction());
7826 
7827   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7828                      MachineMemOperand::MODereferenceable |
7829                          MachineMemOperand::MOInvariant);
7830 }
7831
7832 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7833                                    const SDLoc &DL, SDValue V) const {
7834 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7835 // the destination register.
7836 //
7837 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7838 // so we will end up with redundant moves to m0.
7839 //
7840 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7841
7842 // A Null SDValue creates a glue result.
7843 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7844 V, Chain);
7845 return SDValue(M0, 0);
7846}
7847
7848SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7849 MVT VT,
7850 unsigned Offset) const {
7851 SDLoc SL(Op);
7852 SDValue Param = lowerKernargMemParameter(
7853 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7854 // The local size values will have the hi 16-bits as zero.
7855 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7856 DAG.getValueType(VT));
7857}
7858
7859 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7860                                         EVT VT) {
7861   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7862                                       "non-hsa intrinsic with hsa target",
7863                                       DL.getDebugLoc());
7864 DAG.getContext()->diagnose(BadIntrin);
7865 return DAG.getUNDEF(VT);
7866}
7867
7868 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7869                                          EVT VT) {
7870   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7871                                       "intrinsic not supported on subtarget",
7872                                       DL.getDebugLoc());
7873 DAG.getContext()->diagnose(BadIntrin);
7874 return DAG.getUNDEF(VT);
7875}
7876
7877 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7878                                     ArrayRef<SDValue> Elts) {
7879 assert(!Elts.empty());
7880 MVT Type;
7881 unsigned NumElts = Elts.size();
7882
7883 if (NumElts <= 12) {
7884 Type = MVT::getVectorVT(MVT::f32, NumElts);
7885 } else {
7886 assert(Elts.size() <= 16);
7887 Type = MVT::v16f32;
7888 NumElts = 16;
7889 }
7890
7891 SmallVector<SDValue, 16> VecElts(NumElts);
7892 for (unsigned i = 0; i < Elts.size(); ++i) {
7893 SDValue Elt = Elts[i];
7894 if (Elt.getValueType() != MVT::f32)
7895 Elt = DAG.getBitcast(MVT::f32, Elt);
7896 VecElts[i] = Elt;
7897 }
7898 for (unsigned i = Elts.size(); i < NumElts; ++i)
7899 VecElts[i] = DAG.getUNDEF(MVT::f32);
7900
7901 if (NumElts == 1)
7902 return VecElts[0];
7903 return DAG.getBuildVector(Type, DL, VecElts);
7904}
7905
7906static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7907 SDValue Src, int ExtraElts) {
7908 EVT SrcVT = Src.getValueType();
7909
7910   SmallVector<SDValue, 8> Elts;
7911 
7912 if (SrcVT.isVector())
7913 DAG.ExtractVectorElements(Src, Elts);
7914 else
7915 Elts.push_back(Src);
7916
7917 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7918 while (ExtraElts--)
7919 Elts.push_back(Undef);
7920
7921 return DAG.getBuildVector(CastVT, DL, Elts);
7922}
7923
7924 // Re-construct the required return value for an image load intrinsic.
7925 // This is more complicated due to the optional use of TexFailCtrl, which means
7926 // the required return type is an aggregate.
7927 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7928                                  ArrayRef<EVT> ResultTypes, bool IsTexFail,
7929 bool Unpacked, bool IsD16, int DMaskPop,
7930 int NumVDataDwords, bool IsAtomicPacked16Bit,
7931 const SDLoc &DL) {
7932 // Determine the required return type. This is the same regardless of
7933 // IsTexFail flag
7934 EVT ReqRetVT = ResultTypes[0];
7935 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7936 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7937 ? (ReqRetNumElts + 1) / 2
7938 : ReqRetNumElts;
7939
7940 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7941
7942 MVT DataDwordVT =
7943 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7944
7945 MVT MaskPopVT =
7946 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7947
7948 SDValue Data(Result, 0);
7949 SDValue TexFail;
7950
7951 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7952 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7953 if (MaskPopVT.isVector()) {
7954 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7955 SDValue(Result, 0), ZeroIdx);
7956 } else {
7957 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7958 SDValue(Result, 0), ZeroIdx);
7959 }
7960 }
7961
7962 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7963 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7964 NumDataDwords - MaskPopDwords);
7965
7966 if (IsD16)
7967 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7968
7969 EVT LegalReqRetVT = ReqRetVT;
7970 if (!ReqRetVT.isVector()) {
7971 if (!Data.getValueType().isInteger())
7972 Data = DAG.getNode(ISD::BITCAST, DL,
7973 Data.getValueType().changeTypeToInteger(), Data);
7974 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7975 } else {
7976 // We need to widen the return vector to a legal type
7977 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7978 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7979 LegalReqRetVT =
7980           EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7981                            ReqRetVT.getVectorNumElements() + 1);
7982 }
7983 }
7984 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7985
7986 if (IsTexFail) {
7987 TexFail =
7988 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7989 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7990
7991 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7992 }
7993
7994 if (Result->getNumValues() == 1)
7995 return Data;
7996
7997 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7998}
7999
8000static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8001 SDValue *LWE, bool &IsTexFail) {
8002 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8003
8004 uint64_t Value = TexFailCtrlConst->getZExtValue();
8005 if (Value) {
8006 IsTexFail = true;
8007 }
8008
8009 SDLoc DL(TexFailCtrlConst);
8010 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8011 Value &= ~(uint64_t)0x1;
8012 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8013 Value &= ~(uint64_t)0x2;
8014
8015 return Value == 0;
8016}
8017
8018 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8019                                       MVT PackVectorVT,
8020 SmallVectorImpl<SDValue> &PackedAddrs,
8021 unsigned DimIdx, unsigned EndIdx,
8022 unsigned NumGradients) {
8023 SDLoc DL(Op);
8024 for (unsigned I = DimIdx; I < EndIdx; I++) {
8025 SDValue Addr = Op.getOperand(I);
8026
8027 // Gradients are packed with undef for each coordinate.
8028 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8029 // 1D: undef,dx/dh; undef,dx/dv
8030 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8031 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8032 if (((I + 1) >= EndIdx) ||
8033 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8034 I == DimIdx + NumGradients - 1))) {
8035 if (Addr.getValueType() != MVT::i16)
8036 Addr = DAG.getBitcast(MVT::i16, Addr);
8037 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8038 } else {
8039 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8040 I++;
8041 }
8042 Addr = DAG.getBitcast(MVT::f32, Addr);
8043 PackedAddrs.push_back(Addr);
8044 }
8045}
8046
8047SDValue SITargetLowering::lowerImage(SDValue Op,
8048                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
8049                                      SelectionDAG &DAG, bool WithChain) const {
8050   SDLoc DL(Op);
8051   MachineFunction &MF = DAG.getMachineFunction();
8052   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8053   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8054       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8055   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8056 unsigned IntrOpcode = Intr->BaseOpcode;
8057 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8058 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8059 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8060
8061 SmallVector<EVT, 3> ResultTypes(Op->values());
8062 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8063 bool IsD16 = false;
8064 bool IsG16 = false;
8065 bool IsA16 = false;
8066 SDValue VData;
8067 int NumVDataDwords = 0;
8068 bool AdjustRetType = false;
8069 bool IsAtomicPacked16Bit = false;
8070
8071 // Offset of intrinsic arguments
8072 const unsigned ArgOffset = WithChain ? 2 : 1;
8073
8074 unsigned DMask;
8075 unsigned DMaskLanes = 0;
8076
8077 if (BaseOpcode->Atomic) {
8078 VData = Op.getOperand(2);
8079
8080 IsAtomicPacked16Bit =
8081 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8082 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8083
8084 bool Is64Bit = VData.getValueSizeInBits() == 64;
8085 if (BaseOpcode->AtomicX2) {
8086 SDValue VData2 = Op.getOperand(3);
8087 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8088 {VData, VData2});
8089 if (Is64Bit)
8090 VData = DAG.getBitcast(MVT::v4i32, VData);
8091
8092 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8093 DMask = Is64Bit ? 0xf : 0x3;
8094 NumVDataDwords = Is64Bit ? 4 : 2;
8095 } else {
8096 DMask = Is64Bit ? 0x3 : 0x1;
8097 NumVDataDwords = Is64Bit ? 2 : 1;
8098 }
8099 } else {
8100 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8101 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8102
8103 if (BaseOpcode->Store) {
8104 VData = Op.getOperand(2);
8105
8106 MVT StoreVT = VData.getSimpleValueType();
8107 if (StoreVT.getScalarType() == MVT::f16) {
8108 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8109 return Op; // D16 is unsupported for this instruction
8110
8111 IsD16 = true;
8112 VData = handleD16VData(VData, DAG, true);
8113 }
8114
8115 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8116 } else if (!BaseOpcode->NoReturn) {
8117 // Work out the num dwords based on the dmask popcount and underlying type
8118 // and whether packing is supported.
8119 MVT LoadVT = ResultTypes[0].getSimpleVT();
8120 if (LoadVT.getScalarType() == MVT::f16) {
8121 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8122 return Op; // D16 is unsupported for this instruction
8123
8124 IsD16 = true;
8125 }
8126
8127 // Confirm that the return type is large enough for the dmask specified
8128 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8129 (!LoadVT.isVector() && DMaskLanes > 1))
8130 return Op;
8131
8132 // The sq block of gfx8 and gfx9 do not estimate register use correctly
8133 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8134 // instructions.
8135 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8136 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8137 NumVDataDwords = (DMaskLanes + 1) / 2;
8138 else
8139 NumVDataDwords = DMaskLanes;
8140
8141 AdjustRetType = true;
8142 }
8143 }
8144
8145 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8146   SmallVector<SDValue, 4> VAddrs;
8147 
8148 // Check for 16 bit addresses or derivatives and pack if true.
8149 MVT VAddrVT =
8150 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8151 MVT VAddrScalarVT = VAddrVT.getScalarType();
8152 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8153 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8154
8155 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8156 VAddrScalarVT = VAddrVT.getScalarType();
8157 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8158 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8159
8160 // Push back extra arguments.
8161 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8162 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8163 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8164 // Special handling of bias when A16 is on. Bias is of type half but
8165 // occupies full 32-bit.
8166 SDValue Bias = DAG.getBuildVector(
8167 MVT::v2f16, DL,
8168 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8169 VAddrs.push_back(Bias);
8170 } else {
8171 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8172 "Bias needs to be converted to 16 bit in A16 mode");
8173 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8174 }
8175 }
8176
8177 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8178 // 16 bit gradients are supported, but are tied to the A16 control
8179 // so both gradients and addresses must be 16 bit
8180 LLVM_DEBUG(
8181 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8182 "require 16 bit args for both gradients and addresses");
8183 return Op;
8184 }
8185
8186 if (IsA16) {
8187 if (!ST->hasA16()) {
8188 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8189 "support 16 bit addresses\n");
8190 return Op;
8191 }
8192 }
8193
8194 // We've dealt with incorrect input so we know that if IsA16, IsG16
8195 // are set then we have to compress/pack operands (either address,
8196 // gradient or both)
8197 // In the case where a16 and gradients are tied (no G16 support) then we
8198 // have already verified that both IsA16 and IsG16 are true
8199 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8200 // Activate g16
8201 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8202         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8203     IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8204 }
8205
8206 // Add gradients (packed or unpacked)
8207 if (IsG16) {
8208 // Pack the gradients
8209 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8210 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8211 ArgOffset + Intr->GradientStart,
8212 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8213 } else {
8214 for (unsigned I = ArgOffset + Intr->GradientStart;
8215 I < ArgOffset + Intr->CoordStart; I++)
8216 VAddrs.push_back(Op.getOperand(I));
8217 }
8218
8219 // Add addresses (packed or unpacked)
8220 if (IsA16) {
8221 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8222 ArgOffset + Intr->CoordStart, VAddrEnd,
8223 0 /* No gradients */);
8224 } else {
8225 // Add uncompressed address
8226 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8227 VAddrs.push_back(Op.getOperand(I));
8228 }
8229
8230 // If the register allocator cannot place the address registers contiguously
8231 // without introducing moves, then using the non-sequential address encoding
8232 // is always preferable, since it saves VALU instructions and is usually a
8233 // wash in terms of code size or even better.
8234 //
8235 // However, we currently have no way of hinting to the register allocator that
8236 // MIMG addresses should be placed contiguously when it is possible to do so,
8237 // so force non-NSA for the common 2-address case as a heuristic.
8238 //
8239 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8240 // allocation when possible.
8241 //
8242 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8243 // set of the remaining addresses.
8244 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8245 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8246 const bool UseNSA = ST->hasNSAEncoding() &&
8247 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8248 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8249 const bool UsePartialNSA =
8250 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8251
8252 SDValue VAddr;
8253 if (UsePartialNSA) {
8254 VAddr = getBuildDwordsVector(DAG, DL,
8255 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8256 } else if (!UseNSA) {
8257 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8258 }
8259
8260 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8261 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8262 SDValue Unorm;
8263 if (!BaseOpcode->Sampler) {
8264 Unorm = True;
8265 } else {
8266 uint64_t UnormConst =
8267 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8268
8269 Unorm = UnormConst ? True : False;
8270 }
8271
8272 SDValue TFE;
8273 SDValue LWE;
8274 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8275 bool IsTexFail = false;
8276 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8277 return Op;
8278
8279 if (IsTexFail) {
8280 if (!DMaskLanes) {
8281 // Expecting to get an error flag since TFC is on - and dmask is 0
8282 // Force dmask to be at least 1 otherwise the instruction will fail
8283 DMask = 0x1;
8284 DMaskLanes = 1;
8285 NumVDataDwords = 1;
8286 }
8287 NumVDataDwords += 1;
8288 AdjustRetType = true;
8289 }
8290
8291 // Has something earlier tagged that the return type needs adjusting
8292 // This happens if the instruction is a load or has set TexFailCtrl flags
8293 if (AdjustRetType) {
8294 // NumVDataDwords reflects the true number of dwords required in the return
8295 // type
8296 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8297 // This is a no-op load. This can be eliminated
8298 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8299 if (isa<MemSDNode>(Op))
8300 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8301 return Undef;
8302 }
8303
8304 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8305 MVT::i32, NumVDataDwords)
8306 : MVT::i32;
8307
8308 ResultTypes[0] = NewVT;
8309 if (ResultTypes.size() == 3) {
8310 // Original result was aggregate type used for TexFailCtrl results
8311 // The actual instruction returns as a vector type which has now been
8312 // created. Remove the aggregate result.
8313 ResultTypes.erase(&ResultTypes[1]);
8314 }
8315 }
8316
8317 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8318 if (BaseOpcode->Atomic)
8319 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8320 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8321                AMDGPU::CPol::VOLATILE))
8322     return Op;
8323
8324   SmallVector<SDValue, 26> Ops;
8325   if (BaseOpcode->Store || BaseOpcode->Atomic)
8326 Ops.push_back(VData); // vdata
8327 if (UsePartialNSA) {
8328 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8329 Ops.push_back(VAddr);
8330 } else if (UseNSA)
8331 append_range(Ops, VAddrs);
8332 else
8333 Ops.push_back(VAddr);
8334 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8335 EVT RsrcVT = Rsrc.getValueType();
8336 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8337 return Op;
8338 Ops.push_back(Rsrc);
8339 if (BaseOpcode->Sampler) {
8340 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8341 if (Samp.getValueType() != MVT::v4i32)
8342 return Op;
8343 Ops.push_back(Samp);
8344 }
8345 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8346 if (IsGFX10Plus)
8347 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8348 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8349 Ops.push_back(Unorm);
8350 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8351 Ops.push_back(IsA16 && // r128, a16 for gfx9
8352 ST->hasFeature(AMDGPU::FeatureR128A16)
8353 ? True
8354 : False);
8355 if (IsGFX10Plus)
8356 Ops.push_back(IsA16 ? True : False);
8357 if (!Subtarget->hasGFX90AInsts()) {
8358 Ops.push_back(TFE); // tfe
8359 } else if (TFE->getAsZExtVal()) {
8360 report_fatal_error("TFE is not supported on this GPU");
8361 }
8362 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8363 Ops.push_back(LWE); // lwe
8364 if (!IsGFX10Plus)
8365 Ops.push_back(DimInfo->DA ? True : False);
8366 if (BaseOpcode->HasD16)
8367 Ops.push_back(IsD16 ? True : False);
8368 if (isa<MemSDNode>(Op))
8369 Ops.push_back(Op.getOperand(0)); // chain
8370
8371 int NumVAddrDwords =
8372 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8373 int Opcode = -1;
8374
8375 if (IsGFX12Plus) {
8376 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8377 NumVDataDwords, NumVAddrDwords);
8378 } else if (IsGFX11Plus) {
8379 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8380 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8381 : AMDGPU::MIMGEncGfx11Default,
8382 NumVDataDwords, NumVAddrDwords);
8383 } else if (IsGFX10Plus) {
8384 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8385 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8386 : AMDGPU::MIMGEncGfx10Default,
8387 NumVDataDwords, NumVAddrDwords);
8388 } else {
8389 if (Subtarget->hasGFX90AInsts()) {
8390 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8391 NumVDataDwords, NumVAddrDwords);
8392 if (Opcode == -1)
8394 "requested image instruction is not supported on this GPU");
8395 }
8396 if (Opcode == -1 &&
8397         Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8398       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8399 NumVDataDwords, NumVAddrDwords);
8400 if (Opcode == -1)
8401 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8402 NumVDataDwords, NumVAddrDwords);
8403 }
8404 if (Opcode == -1)
8405 return Op;
8406
8407 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8408 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8409 MachineMemOperand *MemRef = MemOp->getMemOperand();
8410 DAG.setNodeMemRefs(NewNode, {MemRef});
8411 }
8412
8413 if (BaseOpcode->AtomicX2) {
8414     SmallVector<SDValue, 1> Elt;
8415     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8416 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8417 }
8418 if (BaseOpcode->NoReturn)
8419 return SDValue(NewNode, 0);
8420 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8421 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8422 NumVDataDwords, IsAtomicPacked16Bit, DL);
8423}
8424
8425SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8426 SDValue Offset, SDValue CachePolicy,
8427 SelectionDAG &DAG) const {
8428   MachineFunction &MF = DAG.getMachineFunction();
8429 
8430 const DataLayout &DataLayout = DAG.getDataLayout();
8431   Align Alignment =
8432       DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8433 
8434   MachineMemOperand *MMO = MF.getMachineMemOperand(
8435       MachinePointerInfo(),
8436       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8437           MachineMemOperand::MOInvariant,
8438       VT.getStoreSize(), Alignment);
8439
8440 if (!Offset->isDivergent()) {
8441 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8442
8443 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8444 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8445 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8446 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8447 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8448 SDValue BufferLoad =
8449           DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8450                                   DAG.getVTList(MVT::i32), Ops, VT, MMO);
8451 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8452 }
8453
8454 // Widen vec3 load to vec4.
8455 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8456 !Subtarget->hasScalarDwordx3Loads()) {
8457 EVT WidenedVT =
8458           EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8459       auto WidenedOp = DAG.getMemIntrinsicNode(
8460 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8461 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8462 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8463 DAG.getVectorIdxConstant(0, DL));
8464 return Subvector;
8465 }
8466
8467     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8468                                    DAG.getVTList(VT), Ops, VT, MMO);
8469 }
8470
8471 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8472 // assume that the buffer is unswizzled.
8473 SDValue Ops[] = {
8474 DAG.getEntryNode(), // Chain
8475 Rsrc, // rsrc
8476 DAG.getConstant(0, DL, MVT::i32), // vindex
8477 {}, // voffset
8478 {}, // soffset
8479 {}, // offset
8480 CachePolicy, // cachepolicy
8481 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8482 };
8483 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8484 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8485 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8486 }
8487
8488   SmallVector<SDValue, 4> Loads;
8489   unsigned NumLoads = 1;
8490 MVT LoadVT = VT.getSimpleVT();
8491 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8492 assert((LoadVT.getScalarType() == MVT::i32 ||
8493 LoadVT.getScalarType() == MVT::f32));
8494
8495 if (NumElts == 8 || NumElts == 16) {
8496 NumLoads = NumElts / 4;
8497 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8498 }
8499
8500 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8501
8502 // Use the alignment to ensure that the required offsets will fit into the
8503 // immediate offsets.
8504 setBufferOffsets(Offset, DAG, &Ops[3],
8505 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8506
8507 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8508 for (unsigned i = 0; i < NumLoads; ++i) {
8509 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8510 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8511 LoadVT, MMO, DAG));
8512 }
8513
8514 if (NumElts == 8 || NumElts == 16)
8515 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8516
8517 return Loads[0];
8518}
8519
8520SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8521 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8522 if (!Subtarget->hasArchitectedSGPRs())
8523 return {};
8524 SDLoc SL(Op);
8525 MVT VT = MVT::i32;
8526 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8527 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8528 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8529}
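// --- Illustrative decode (editor addition, plain C++; an assumption) --- With
// architected SGPRs the wave ID within the workgroup is the 5-bit field
// TTMP8[29:25], i.e. a BFE_U32 with offset 25 and width 5.
static inline unsigned waveIdFromTTMP8(uint32_t TTMP8Value) {
  return (TTMP8Value >> 25) & 0x1f;
}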
8530
8531SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8532 unsigned Dim,
8533 const ArgDescriptor &Arg) const {
8534 SDLoc SL(Op);
8535   MachineFunction &MF = DAG.getMachineFunction();
8536   unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8537 if (MaxID == 0)
8538 return DAG.getConstant(0, SL, MVT::i32);
8539
8540 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8541 SDLoc(DAG.getEntryNode()), Arg);
8542
8543 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8544 // masking operations anyway.
8545 //
8546 // TODO: We could assert the top bit is 0 for the source copy.
8547 if (Arg.isMasked())
8548 return Val;
8549
8550 // Preserve the known bits after expansion to a copy.
8551   EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), 32 - llvm::countl_zero(MaxID));
8552   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8553 DAG.getValueType(SmallVT));
8554}
8555
8556SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8557 SelectionDAG &DAG) const {
8558   MachineFunction &MF = DAG.getMachineFunction();
8559   auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8560
8561 EVT VT = Op.getValueType();
8562 SDLoc DL(Op);
8563 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8564
8565 // TODO: Should this propagate fast-math-flags?
8566
8567 switch (IntrinsicID) {
8568 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8569 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8570 return emitNonHSAIntrinsicError(DAG, DL, VT);
8571 return getPreloadedValue(DAG, *MFI, VT,
8572                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8573   }
8574 case Intrinsic::amdgcn_dispatch_ptr:
8575 case Intrinsic::amdgcn_queue_ptr: {
8576 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8577 DiagnosticInfoUnsupported BadIntrin(
8578 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8579 DL.getDebugLoc());
8580 DAG.getContext()->diagnose(BadIntrin);
8581 return DAG.getUNDEF(VT);
8582 }
8583
8584 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8585                      ? AMDGPUFunctionArgInfo::DISPATCH_PTR
8586                      : AMDGPUFunctionArgInfo::QUEUE_PTR;
8587     return getPreloadedValue(DAG, *MFI, VT, RegID);
8588 }
8589 case Intrinsic::amdgcn_implicitarg_ptr: {
8590 if (MFI->isEntryFunction())
8591 return getImplicitArgPtr(DAG, DL);
8592 return getPreloadedValue(DAG, *MFI, VT,
8593                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8594   }
8595 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8596 if (!MFI->isEntryFunction()) {
8597 // This only makes sense to call in a kernel, so just lower to null.
8598 return DAG.getConstant(0, DL, VT);
8599 }
8600
8601 return getPreloadedValue(DAG, *MFI, VT,
8602 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8603 }
8604 case Intrinsic::amdgcn_dispatch_id: {
8605 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8606 }
8607 case Intrinsic::amdgcn_rcp:
8608 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8609 case Intrinsic::amdgcn_rsq:
8610 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8611 case Intrinsic::amdgcn_rsq_legacy:
8612 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8613 return emitRemovedIntrinsicError(DAG, DL, VT);
8614 return SDValue();
8615 case Intrinsic::amdgcn_rcp_legacy:
8616 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8617 return emitRemovedIntrinsicError(DAG, DL, VT);
8618 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8619 case Intrinsic::amdgcn_rsq_clamp: {
8620 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8621 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8622
8623 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8624 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8625 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8626
8627 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8628 SDValue Tmp =
8629 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8630 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8631 DAG.getConstantFP(Min, DL, VT));
8632 }
8633 case Intrinsic::r600_read_ngroups_x:
8634 if (Subtarget->isAmdHsaOS())
8635 return emitNonHSAIntrinsicError(DAG, DL, VT);
8636
8637 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8638 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8639 false);
8640 case Intrinsic::r600_read_ngroups_y:
8641 if (Subtarget->isAmdHsaOS())
8642 return emitNonHSAIntrinsicError(DAG, DL, VT);
8643
8644 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8645 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8646 false);
8647 case Intrinsic::r600_read_ngroups_z:
8648 if (Subtarget->isAmdHsaOS())
8649 return emitNonHSAIntrinsicError(DAG, DL, VT);
8650
8651 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8652 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8653 false);
8654 case Intrinsic::r600_read_global_size_x:
8655 if (Subtarget->isAmdHsaOS())
8656 return emitNonHSAIntrinsicError(DAG, DL, VT);
8657
8658 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8659 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8660 Align(4), false);
8661 case Intrinsic::r600_read_global_size_y:
8662 if (Subtarget->isAmdHsaOS())
8663 return emitNonHSAIntrinsicError(DAG, DL, VT);
8664
8665 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8666 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8667 Align(4), false);
8668 case Intrinsic::r600_read_global_size_z:
8669 if (Subtarget->isAmdHsaOS())
8670 return emitNonHSAIntrinsicError(DAG, DL, VT);
8671
8672 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8673 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8674 Align(4), false);
8675 case Intrinsic::r600_read_local_size_x:
8676 if (Subtarget->isAmdHsaOS())
8677 return emitNonHSAIntrinsicError(DAG, DL, VT);
8678
8679 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8680 SI::KernelInputOffsets::LOCAL_SIZE_X);
8681 case Intrinsic::r600_read_local_size_y:
8682 if (Subtarget->isAmdHsaOS())
8683 return emitNonHSAIntrinsicError(DAG, DL, VT);
8684
8685 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8686 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8687 case Intrinsic::r600_read_local_size_z:
8688 if (Subtarget->isAmdHsaOS())
8689 return emitNonHSAIntrinsicError(DAG, DL, VT);
8690
8691 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8692 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8693 case Intrinsic::amdgcn_workgroup_id_x:
8694 return getPreloadedValue(DAG, *MFI, VT,
8695 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8696 case Intrinsic::amdgcn_workgroup_id_y:
8697 return getPreloadedValue(DAG, *MFI, VT,
8698 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8699 case Intrinsic::amdgcn_workgroup_id_z:
8700 return getPreloadedValue(DAG, *MFI, VT,
8701 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8702 case Intrinsic::amdgcn_wave_id:
8703 return lowerWaveID(DAG, Op);
8704 case Intrinsic::amdgcn_lds_kernel_id: {
8705 if (MFI->isEntryFunction())
8706 return getLDSKernelId(DAG, DL);
8707 return getPreloadedValue(DAG, *MFI, VT,
8708 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8709 }
8710 case Intrinsic::amdgcn_workitem_id_x:
8711 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8712 case Intrinsic::amdgcn_workitem_id_y:
8713 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8714 case Intrinsic::amdgcn_workitem_id_z:
8715 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8716 case Intrinsic::amdgcn_wavefrontsize:
8717 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8718 SDLoc(Op), MVT::i32);
8719 case Intrinsic::amdgcn_s_buffer_load: {
8720 unsigned CPol = Op.getConstantOperandVal(3);
8721 // s_buffer_load, because of how it's optimized, can't be volatile
8722 // so reject ones with the volatile bit set.
8723 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8724 ? AMDGPU::CPol::ALL
8725 : AMDGPU::CPol::ALL_pregfx12))
8726 return Op;
8727 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8728 Op.getOperand(3), DAG);
8729 }
8730 case Intrinsic::amdgcn_fdiv_fast:
8731 return lowerFDIV_FAST(Op, DAG);
8732 case Intrinsic::amdgcn_sin:
8733 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8734
8735 case Intrinsic::amdgcn_cos:
8736 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8737
8738 case Intrinsic::amdgcn_mul_u24:
8739 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8740 Op.getOperand(2));
8741 case Intrinsic::amdgcn_mul_i24:
8742 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8743 Op.getOperand(2));
8744
8745 case Intrinsic::amdgcn_log_clamp: {
8746 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8747 return SDValue();
8748
8749 return emitRemovedIntrinsicError(DAG, DL, VT);
8750 }
8751 case Intrinsic::amdgcn_fract:
8752 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8753
8754 case Intrinsic::amdgcn_class:
8755 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8756 Op.getOperand(2));
8757 case Intrinsic::amdgcn_div_fmas:
8758 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8759 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8760
8761 case Intrinsic::amdgcn_div_fixup:
8762 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8763 Op.getOperand(2), Op.getOperand(3));
8764
8765 case Intrinsic::amdgcn_div_scale: {
8766 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8767
8768 // Translate to the operands expected by the machine instruction. The
8769 // first parameter must be the same as the first instruction.
8770 SDValue Numerator = Op.getOperand(1);
8771 SDValue Denominator = Op.getOperand(2);
8772
8773 // Note this order is opposite of the machine instruction's operations,
8774 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8775 // intrinsic has the numerator as the first operand to match a normal
8776 // division operation.
8777
8778 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8779
8780 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8781 Denominator, Numerator);
8782 }
8783 case Intrinsic::amdgcn_icmp: {
8784 // There is a Pat that handles this variant, so return it as-is.
8785 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8786 Op.getConstantOperandVal(2) == 0 &&
8787 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8788 return Op;
8789 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8790 }
8791 case Intrinsic::amdgcn_fcmp: {
8792 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8793 }
8794 case Intrinsic::amdgcn_ballot:
8795 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8796 case Intrinsic::amdgcn_fmed3:
8797 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8798 Op.getOperand(2), Op.getOperand(3));
8799 case Intrinsic::amdgcn_fdot2:
8800 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8801 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8802 case Intrinsic::amdgcn_fmul_legacy:
8803 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8804 Op.getOperand(2));
8805 case Intrinsic::amdgcn_sffbh:
8806 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8807 case Intrinsic::amdgcn_sbfe:
8808 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8809 Op.getOperand(2), Op.getOperand(3));
8810 case Intrinsic::amdgcn_ubfe:
8811 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8812 Op.getOperand(2), Op.getOperand(3));
8813 case Intrinsic::amdgcn_cvt_pkrtz:
8814 case Intrinsic::amdgcn_cvt_pknorm_i16:
8815 case Intrinsic::amdgcn_cvt_pknorm_u16:
8816 case Intrinsic::amdgcn_cvt_pk_i16:
8817 case Intrinsic::amdgcn_cvt_pk_u16: {
8818 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8819 EVT VT = Op.getValueType();
8820 unsigned Opcode;
8821
8822 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8823 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8824 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8825 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8826 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8827 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8828 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8829 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8830 else
8831 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8832
8833 if (isTypeLegal(VT))
8834 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8835
8836 SDValue Node =
8837 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8838 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8839 }
8840 case Intrinsic::amdgcn_fmad_ftz:
8841 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8842 Op.getOperand(2), Op.getOperand(3));
8843
8844 case Intrinsic::amdgcn_if_break:
8845 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8846 Op->getOperand(1), Op->getOperand(2)),
8847 0);
8848
8849 case Intrinsic::amdgcn_groupstaticsize: {
8850 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8851 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8852 return Op;
8853
8854 const Module *M = MF.getFunction().getParent();
8855 const GlobalValue *GV =
8856 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8857 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8858 SIInstrInfo::MO_ABS32_LO);
8859 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8860 }
8861 case Intrinsic::amdgcn_is_shared:
8862 case Intrinsic::amdgcn_is_private: {
8863 SDLoc SL(Op);
8864 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8865 ? AMDGPUAS::LOCAL_ADDRESS
8866 : AMDGPUAS::PRIVATE_ADDRESS;
8867 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8868 SDValue SrcVec =
8869 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8870
8871 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8872 DAG.getConstant(1, SL, MVT::i32));
8873 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8874 }
8875 case Intrinsic::amdgcn_perm:
8876 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8877 Op.getOperand(2), Op.getOperand(3));
8878 case Intrinsic::amdgcn_reloc_constant: {
8879 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8880 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8881 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8882 auto *RelocSymbol = cast<GlobalVariable>(
8883 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8884 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8885 SIInstrInfo::MO_ABS32_LO);
8886 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8887 }
8888 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8889 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8890 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8891 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8892 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8893 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8894 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8895 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8896 if (Op.getOperand(4).getValueType() == MVT::i32)
8897 return SDValue();
8898
8899 SDLoc SL(Op);
8900 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8901 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8902 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8903 Op.getOperand(3), IndexKeyi32);
8904 }
8905 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8906 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8907 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8908 if (Op.getOperand(6).getValueType() == MVT::i32)
8909 return SDValue();
8910
8911 SDLoc SL(Op);
8912 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8913 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8914 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8915 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8916 IndexKeyi32, Op.getOperand(7)});
8917 }
8918 case Intrinsic::amdgcn_addrspacecast_nonnull:
8919 return lowerADDRSPACECAST(Op, DAG);
8920 case Intrinsic::amdgcn_readlane:
8921 case Intrinsic::amdgcn_readfirstlane:
8922 case Intrinsic::amdgcn_writelane:
8923 case Intrinsic::amdgcn_permlane16:
8924 case Intrinsic::amdgcn_permlanex16:
8925 case Intrinsic::amdgcn_permlane64:
8926 case Intrinsic::amdgcn_set_inactive:
8927 case Intrinsic::amdgcn_set_inactive_chain_arg:
8928 case Intrinsic::amdgcn_mov_dpp8:
8929 case Intrinsic::amdgcn_update_dpp:
8930 return lowerLaneOp(*this, Op.getNode(), DAG);
8931 default:
8932 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8933 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8934 return lowerImage(Op, ImageDimIntr, DAG, false);
8935
8936 return Op;
8937 }
8938}
8939
8940// On targets not supporting constant in soffset field, turn zero to
8941// SGPR_NULL to avoid generating an extra s_mov with zero.
8942 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8943 const GCNSubtarget *Subtarget) {
8944 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8945 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8946 return SOffset;
8947}
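// Illustrative example: on subtargets where hasRestrictedSOffset() is true, a
// buffer intrinsic called with soffset == 0 receives SGPR_NULL as its soffset
// operand here, rather than materializing the zero into an SGPR with a
// separate s_mov_b32.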
8948
8949SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8950 SelectionDAG &DAG,
8951 unsigned NewOpcode) const {
8952 SDLoc DL(Op);
8953
8954 SDValue VData = Op.getOperand(2);
8955 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8956 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8957 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8958 SDValue Ops[] = {
8959 Op.getOperand(0), // Chain
8960 VData, // vdata
8961 Rsrc, // rsrc
8962 DAG.getConstant(0, DL, MVT::i32), // vindex
8963 VOffset, // voffset
8964 SOffset, // soffset
8965 Offset, // offset
8966 Op.getOperand(6), // cachepolicy
8967 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8968 };
8969
8970 auto *M = cast<MemSDNode>(Op);
8971
8972 EVT MemVT = VData.getValueType();
8973 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8974 M->getMemOperand());
8975}
8976
8977SDValue
8978SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8979 unsigned NewOpcode) const {
8980 SDLoc DL(Op);
8981
8982 SDValue VData = Op.getOperand(2);
8983 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8984 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
8985 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8986 SDValue Ops[] = {
8987 Op.getOperand(0), // Chain
8988 VData, // vdata
8989 Rsrc, // rsrc
8990 Op.getOperand(4), // vindex
8991 VOffset, // voffset
8992 SOffset, // soffset
8993 Offset, // offset
8994 Op.getOperand(7), // cachepolicy
8995 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8996 };
8997
8998 auto *M = cast<MemSDNode>(Op);
8999
9000 EVT MemVT = VData.getValueType();
9001 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9002 M->getMemOperand());
9003}
9004
9005SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9006 SelectionDAG &DAG) const {
9007 unsigned IntrID = Op.getConstantOperandVal(1);
9008 SDLoc DL(Op);
9009
9010 switch (IntrID) {
9011 case Intrinsic::amdgcn_ds_ordered_add:
9012 case Intrinsic::amdgcn_ds_ordered_swap: {
9013 MemSDNode *M = cast<MemSDNode>(Op);
9014 SDValue Chain = M->getOperand(0);
9015 SDValue M0 = M->getOperand(2);
9016 SDValue Value = M->getOperand(3);
9017 unsigned IndexOperand = M->getConstantOperandVal(7);
9018 unsigned WaveRelease = M->getConstantOperandVal(8);
9019 unsigned WaveDone = M->getConstantOperandVal(9);
9020
9021 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9022 IndexOperand &= ~0x3f;
9023 unsigned CountDw = 0;
9024
9025 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9026 CountDw = (IndexOperand >> 24) & 0xf;
9027 IndexOperand &= ~(0xf << 24);
9028
9029 if (CountDw < 1 || CountDw > 4) {
9030 report_fatal_error(
9031 "ds_ordered_count: dword count must be between 1 and 4");
9032 }
9033 }
9034
9035 if (IndexOperand)
9036 report_fatal_error("ds_ordered_count: bad index operand");
9037
9038 if (WaveDone && !WaveRelease)
9039 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9040
9041 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9042 unsigned ShaderType =
9043 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9044 unsigned Offset0 = OrderedCountIndex << 2;
9045 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9046
9047 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9048 Offset1 |= (CountDw - 1) << 6;
9049
9050 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9051 Offset1 |= ShaderType << 2;
9052
9053 unsigned Offset = Offset0 | (Offset1 << 8);
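// Illustrative example (ignoring the ShaderType and CountDw adjustments
// above): for ds_ordered_add (Instruction = 0) with OrderedCountIndex = 1,
// WaveRelease = 1 and WaveDone = 0, Offset0 = 1 << 2 = 4 and Offset1 = 1,
// giving a combined Offset of 4 | (1 << 8) = 0x104.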
9054
9055 SDValue Ops[] = {
9056 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9057 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9058 };
9059 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9060 M->getVTList(), Ops, M->getMemoryVT(),
9061 M->getMemOperand());
9062 }
9063 case Intrinsic::amdgcn_raw_buffer_load:
9064 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9065 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9066 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9067 case Intrinsic::amdgcn_raw_buffer_load_format:
9068 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9069 const bool IsFormat =
9070 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9071 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9072
9073 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9074 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9075 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9076 SDValue Ops[] = {
9077 Op.getOperand(0), // Chain
9078 Rsrc, // rsrc
9079 DAG.getConstant(0, DL, MVT::i32), // vindex
9080 VOffset, // voffset
9081 SOffset, // soffset
9082 Offset, // offset
9083 Op.getOperand(5), // cachepolicy, swizzled buffer
9084 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9085 };
9086
9087 auto *M = cast<MemSDNode>(Op);
9088 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9089 }
9090 case Intrinsic::amdgcn_struct_buffer_load:
9091 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9092 case Intrinsic::amdgcn_struct_buffer_load_format:
9093 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9094 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9095 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9096 const bool IsFormat =
9097 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9098 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9099
9100 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9101 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9102 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9103 SDValue Ops[] = {
9104 Op.getOperand(0), // Chain
9105 Rsrc, // rsrc
9106 Op.getOperand(3), // vindex
9107 VOffset, // voffset
9108 SOffset, // soffset
9109 Offset, // offset
9110 Op.getOperand(6), // cachepolicy, swizzled buffer
9111 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9112 };
9113
9114 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9115 }
9116 case Intrinsic::amdgcn_raw_tbuffer_load:
9117 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9118 MemSDNode *M = cast<MemSDNode>(Op);
9119 EVT LoadVT = Op.getValueType();
9120 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9121 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9122 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9123
9124 SDValue Ops[] = {
9125 Op.getOperand(0), // Chain
9126 Rsrc, // rsrc
9127 DAG.getConstant(0, DL, MVT::i32), // vindex
9128 VOffset, // voffset
9129 SOffset, // soffset
9130 Offset, // offset
9131 Op.getOperand(5), // format
9132 Op.getOperand(6), // cachepolicy, swizzled buffer
9133 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9134 };
9135
9136 if (LoadVT.getScalarType() == MVT::f16)
9137 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9138 Ops);
9139 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9140 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9141 DAG);
9142 }
9143 case Intrinsic::amdgcn_struct_tbuffer_load:
9144 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9145 MemSDNode *M = cast<MemSDNode>(Op);
9146 EVT LoadVT = Op.getValueType();
9147 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9148 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9149 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9150
9151 SDValue Ops[] = {
9152 Op.getOperand(0), // Chain
9153 Rsrc, // rsrc
9154 Op.getOperand(3), // vindex
9155 VOffset, // voffset
9156 SOffset, // soffset
9157 Offset, // offset
9158 Op.getOperand(6), // format
9159 Op.getOperand(7), // cachepolicy, swizzled buffer
9160 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9161 };
9162
9163 if (LoadVT.getScalarType() == MVT::f16)
9164 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9165 Ops);
9166 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9167 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9168 DAG);
9169 }
9170 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9171 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9172 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9173 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9174 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9175 return lowerStructBufferAtomicIntrin(Op, DAG,
9176 AMDGPUISD::BUFFER_ATOMIC_FADD);
9177 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9178 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9179 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9180 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9182 return lowerStructBufferAtomicIntrin(Op, DAG,
9183 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9184 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9186 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9187 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9188 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9189 return lowerStructBufferAtomicIntrin(Op, DAG,
9190 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9191 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9192 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9193 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9194 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9195 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9196 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9197 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9198 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9199 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9200 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9201 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9202 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9203 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9204 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9205 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9206 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9207 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9208 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9209 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9210 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9211 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9212 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9213 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9214 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9215 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9217 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9218 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9219 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9220 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9221 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9222 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9223 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9224 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9225 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9226 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9227 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9228 return lowerRawBufferAtomicIntrin(Op, DAG,
9229 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9230 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9231 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9232 return lowerStructBufferAtomicIntrin(Op, DAG,
9233 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9234 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9235 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9236 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9237 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9239 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9240 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9241 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9242 return lowerStructBufferAtomicIntrin(Op, DAG,
9243 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9244 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9245 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9246 return lowerStructBufferAtomicIntrin(Op, DAG,
9247 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9248 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9249 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9250 return lowerStructBufferAtomicIntrin(Op, DAG,
9251 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9252 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9254 return lowerStructBufferAtomicIntrin(Op, DAG,
9255 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9256 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9257 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9258 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9259 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9260 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9261 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9262 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9264 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9265 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9266 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9267 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9268 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9269 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9270 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9271 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9272 return lowerStructBufferAtomicIntrin(Op, DAG,
9273 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9274
9275 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9276 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9277 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9278 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9279 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9280 SDValue Ops[] = {
9281 Op.getOperand(0), // Chain
9282 Op.getOperand(2), // src
9283 Op.getOperand(3), // cmp
9284 Rsrc, // rsrc
9285 DAG.getConstant(0, DL, MVT::i32), // vindex
9286 VOffset, // voffset
9287 SOffset, // soffset
9288 Offset, // offset
9289 Op.getOperand(7), // cachepolicy
9290 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9291 };
9292 EVT VT = Op.getValueType();
9293 auto *M = cast<MemSDNode>(Op);
9294
9295 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9296 Op->getVTList(), Ops, VT,
9297 M->getMemOperand());
9298 }
9299 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9300 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9301 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9302 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9303 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9304 SDValue Ops[] = {
9305 Op.getOperand(0), // Chain
9306 Op.getOperand(2), // src
9307 Op.getOperand(3), // cmp
9308 Rsrc, // rsrc
9309 Op.getOperand(5), // vindex
9310 VOffset, // voffset
9311 SOffset, // soffset
9312 Offset, // offset
9313 Op.getOperand(8), // cachepolicy
9314 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9315 };
9316 EVT VT = Op.getValueType();
9317 auto *M = cast<MemSDNode>(Op);
9318
9319 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9320 Op->getVTList(), Ops, VT,
9321 M->getMemOperand());
9322 }
9323 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9324 MemSDNode *M = cast<MemSDNode>(Op);
9325 SDValue NodePtr = M->getOperand(2);
9326 SDValue RayExtent = M->getOperand(3);
9327 SDValue RayOrigin = M->getOperand(4);
9328 SDValue RayDir = M->getOperand(5);
9329 SDValue RayInvDir = M->getOperand(6);
9330 SDValue TDescr = M->getOperand(7);
9331
9332 assert(NodePtr.getValueType() == MVT::i32 ||
9333 NodePtr.getValueType() == MVT::i64);
9334 assert(RayDir.getValueType() == MVT::v3f16 ||
9335 RayDir.getValueType() == MVT::v3f32);
9336
9337 if (!Subtarget->hasGFX10_AEncoding()) {
9338 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9339 return SDValue();
9340 }
9341
9342 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9343 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9344 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9345 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9346 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9347 const unsigned NumVDataDwords = 4;
9348 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9349 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9350 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9351 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9352 IsGFX12Plus;
9353 const unsigned BaseOpcodes[2][2] = {
9354 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9355 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9356 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9357 int Opcode;
9358 if (UseNSA) {
9359 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9360 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9361 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9362 : AMDGPU::MIMGEncGfx10NSA,
9363 NumVDataDwords, NumVAddrDwords);
9364 } else {
9365 assert(!IsGFX12Plus);
9366 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9367 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9368 : AMDGPU::MIMGEncGfx10Default,
9369 NumVDataDwords, NumVAddrDwords);
9370 }
9371 assert(Opcode != -1);
9372
9373 SmallVector<SDValue, 16> Ops;
9374
9375 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9376 SmallVector<SDValue, 3> Lanes;
9377 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9378 if (Lanes[0].getValueSizeInBits() == 32) {
9379 for (unsigned I = 0; I < 3; ++I)
9380 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9381 } else {
9382 if (IsAligned) {
9383 Ops.push_back(DAG.getBitcast(
9384 MVT::i32,
9385 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9386 Ops.push_back(Lanes[2]);
9387 } else {
9388 SDValue Elt0 = Ops.pop_back_val();
9389 Ops.push_back(DAG.getBitcast(
9390 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9391 Ops.push_back(DAG.getBitcast(
9392 MVT::i32,
9393 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9394 }
9395 }
9396 };
9397
9398 if (UseNSA && IsGFX11Plus) {
9399 Ops.push_back(NodePtr);
9400 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9401 Ops.push_back(RayOrigin);
9402 if (IsA16) {
9403 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9404 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9405 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9406 for (unsigned I = 0; I < 3; ++I) {
9407 MergedLanes.push_back(DAG.getBitcast(
9408 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9409 {DirLanes[I], InvDirLanes[I]})));
9410 }
9411 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9412 } else {
9413 Ops.push_back(RayDir);
9414 Ops.push_back(RayInvDir);
9415 }
9416 } else {
9417 if (Is64)
9418 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9419 2);
9420 else
9421 Ops.push_back(NodePtr);
9422
9423 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9424 packLanes(RayOrigin, true);
9425 packLanes(RayDir, true);
9426 packLanes(RayInvDir, false);
9427 }
9428
9429 if (!UseNSA) {
9430 // Build a single vector containing all the operands so far prepared.
9431 if (NumVAddrDwords > 12) {
9432 SDValue Undef = DAG.getUNDEF(MVT::i32);
9433 Ops.append(16 - Ops.size(), Undef);
9434 }
9435 assert(Ops.size() >= 8 && Ops.size() <= 12);
9436 SDValue MergedOps =
9437 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9438 Ops.clear();
9439 Ops.push_back(MergedOps);
9440 }
9441
9442 Ops.push_back(TDescr);
9443 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9444 Ops.push_back(M->getChain());
9445
9446 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9447 MachineMemOperand *MemRef = M->getMemOperand();
9448 DAG.setNodeMemRefs(NewNode, {MemRef});
9449 return SDValue(NewNode, 0);
9450 }
9451 case Intrinsic::amdgcn_global_atomic_fmin_num:
9452 case Intrinsic::amdgcn_global_atomic_fmax_num:
9453 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9454 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9455 MemSDNode *M = cast<MemSDNode>(Op);
9456 SDValue Ops[] = {
9457 M->getOperand(0), // Chain
9458 M->getOperand(2), // Ptr
9459 M->getOperand(3) // Value
9460 };
9461 unsigned Opcode = 0;
9462 switch (IntrID) {
9463 case Intrinsic::amdgcn_global_atomic_fmin_num:
9464 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9465 Opcode = ISD::ATOMIC_LOAD_FMIN;
9466 break;
9467 }
9468 case Intrinsic::amdgcn_global_atomic_fmax_num:
9469 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9470 Opcode = ISD::ATOMIC_LOAD_FMAX;
9471 break;
9472 }
9473 default:
9474 llvm_unreachable("unhandled atomic opcode");
9475 }
9476 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9477 Ops, M->getMemOperand());
9478 }
9479 case Intrinsic::amdgcn_s_get_barrier_state:
9480 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9481 SDValue Chain = Op->getOperand(0);
9482 SmallVector<SDValue, 2> Ops;
9483 unsigned Opc;
9484
9485 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9486 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9487 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9488 BarID = (BarID >> 4) & 0x3F;
9489 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9490 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9491 Ops.push_back(K);
9492 Ops.push_back(Chain);
9493 } else {
9494 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9495 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9496 SDValue M0Val;
9497 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9498 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9499 M0Val = SDValue(
9500 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9501 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9502 0);
9503 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9504 } else
9505 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9506 }
9507
9508 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9509 return SDValue(NewMI, 0);
9510 }
9511 default:
9512
9513 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9514 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9515 return lowerImage(Op, ImageDimIntr, DAG, true);
9516
9517 return SDValue();
9518 }
9519}
9520
9521// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9522// dwordx4 if on SI and handle TFE loads.
9523SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9524 SDVTList VTList,
9525 ArrayRef<SDValue> Ops, EVT MemVT,
9526 MachineMemOperand *MMO,
9527 SelectionDAG &DAG) const {
9528 LLVMContext &C = *DAG.getContext();
9529 MachineFunction &MF = DAG.getMachineFunction();
9530 EVT VT = VTList.VTs[0];
9531
9532 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9533 bool IsTFE = VTList.NumVTs == 3;
9534 if (IsTFE) {
9535 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9536 unsigned NumOpDWords = NumValueDWords + 1;
9537 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9538 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9539 MachineMemOperand *OpDWordsMMO =
9540 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9541 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9542 OpDWordsVT, OpDWordsMMO, DAG);
9543 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9544 DAG.getVectorIdxConstant(NumValueDWords, DL));
9545 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9546 SDValue ValueDWords =
9547 NumValueDWords == 1
9548 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9549 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9550 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9551 ZeroIdx);
9552 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9553 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9554 }
9555
9556 if (!Subtarget->hasDwordx3LoadStores() &&
9557 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9558 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9559 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9560 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9561 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9562 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9563 WidenedMemVT, WidenedMMO);
9564 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9565 DAG.getVectorIdxConstant(0, DL));
9566 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9567 }
9568
9569 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9570}
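// Illustrative example: a TFE buffer load of a single i32 reaches the IsTFE
// path with NumValueDWords = 1, so the operation is widened to a v2i32 load;
// element 0 is returned as the data dword and element 1 as the status dword.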
9571
9572SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9573 bool ImageStore) const {
9574 EVT StoreVT = VData.getValueType();
9575
9576 // No change for f16 and legal vector D16 types.
9577 if (!StoreVT.isVector())
9578 return VData;
9579
9580 SDLoc DL(VData);
9581 unsigned NumElements = StoreVT.getVectorNumElements();
9582
9583 if (Subtarget->hasUnpackedD16VMem()) {
9584 // We need to unpack the packed data to store.
9585 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9586 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9587
9588 EVT EquivStoreVT =
9589 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9590 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9591 return DAG.UnrollVectorOp(ZExt.getNode());
9592 }
9593
9594 // The sq block of gfx8.1 does not estimate register use correctly for d16
9595 // image store instructions. The data operand is computed as if it were not a
9596 // d16 image instruction.
9597 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9598 // Bitcast to i16
9599 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9600 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9601
9602 // Decompose into scalars
9603 SmallVector<SDValue, 4> Elts;
9604 DAG.ExtractVectorElements(IntVData, Elts);
9605
9606 // Group pairs of i16 into v2i16 and bitcast to i32
9607 SmallVector<SDValue, 4> PackedElts;
9608 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9609 SDValue Pair =
9610 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9611 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9612 PackedElts.push_back(IntPair);
9613 }
9614 if ((NumElements % 2) == 1) {
9615 // Handle v3i16
9616 unsigned I = Elts.size() / 2;
9617 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9618 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9619 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9620 PackedElts.push_back(IntPair);
9621 }
9622
9623 // Pad using UNDEF
9624 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9625
9626 // Build final vector
9627 EVT VecVT =
9628 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9629 return DAG.getBuildVector(VecVT, DL, PackedElts);
9630 }
9631
9632 if (NumElements == 3) {
9633 EVT IntStoreVT =
9634 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9635 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9636
9637 EVT WidenedStoreVT = EVT::getVectorVT(
9638 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9639 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9640 WidenedStoreVT.getStoreSizeInBits());
9641 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9642 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9643 }
9644
9645 assert(isTypeLegal(StoreVT));
9646 return VData;
9647}
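// Illustrative example: on a subtarget with unpacked d16 VMEM, a v2f16 store
// value is bitcast to v2i16 and zero-extended to v2i32, one dword per
// component; on the gfx8.1 image-store path, a v4f16 value is instead
// repacked into two i32s, each holding a pair of halves.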
9648
9649SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9650 SelectionDAG &DAG) const {
9651 SDLoc DL(Op);
9652 SDValue Chain = Op.getOperand(0);
9653 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9654 MachineFunction &MF = DAG.getMachineFunction();
9655
9656 switch (IntrinsicID) {
9657 case Intrinsic::amdgcn_exp_compr: {
9658 if (!Subtarget->hasCompressedExport()) {
9659 DiagnosticInfoUnsupported BadIntrin(
9660 DAG.getMachineFunction().getFunction(),
9661 "intrinsic not supported on subtarget", DL.getDebugLoc());
9662 DAG.getContext()->diagnose(BadIntrin);
9663 }
9664 SDValue Src0 = Op.getOperand(4);
9665 SDValue Src1 = Op.getOperand(5);
9666 // Hack around illegal type on SI by directly selecting it.
9667 if (isTypeLegal(Src0.getValueType()))
9668 return SDValue();
9669
9670 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9671 SDValue Undef = DAG.getUNDEF(MVT::f32);
9672 const SDValue Ops[] = {
9673 Op.getOperand(2), // tgt
9674 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9675 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9676 Undef, // src2
9677 Undef, // src3
9678 Op.getOperand(7), // vm
9679 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9680 Op.getOperand(3), // en
9681 Op.getOperand(0) // Chain
9682 };
9683
9684 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9685 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9686 }
9687 case Intrinsic::amdgcn_s_barrier:
9688 case Intrinsic::amdgcn_s_barrier_signal:
9689 case Intrinsic::amdgcn_s_barrier_wait: {
9690 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9691 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9692 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9693 if (WGSize <= ST.getWavefrontSize()) {
9694 // If the workgroup fits in a wave, remove s_barrier_signal and lower
9695 // s_barrier/s_barrier_wait to wave_barrier.
9696 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9697 return Op.getOperand(0);
9698 else
9699 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9700 MVT::Other, Op.getOperand(0)),
9701 0);
9702 }
9703 }
9704
9705 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9706 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9707 SDValue K =
9708 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9709 SDValue BarSignal =
9710 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9711 MVT::Other, K, Op.getOperand(0)),
9712 0);
9713 SDValue BarWait =
9714 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9715 BarSignal.getValue(0)),
9716 0);
9717 return BarWait;
9718 }
9719
9720 return SDValue();
9721 };
9722
9723 case Intrinsic::amdgcn_struct_tbuffer_store:
9724 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9725 SDValue VData = Op.getOperand(2);
9726 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9727 if (IsD16)
9728 VData = handleD16VData(VData, DAG);
9729 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9730 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9731 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9732 SDValue Ops[] = {
9733 Chain,
9734 VData, // vdata
9735 Rsrc, // rsrc
9736 Op.getOperand(4), // vindex
9737 VOffset, // voffset
9738 SOffset, // soffset
9739 Offset, // offset
9740 Op.getOperand(7), // format
9741 Op.getOperand(8), // cachepolicy, swizzled buffer
9742 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9743 };
9744 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9745 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9746 MemSDNode *M = cast<MemSDNode>(Op);
9747 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9748 M->getMemoryVT(), M->getMemOperand());
9749 }
9750
9751 case Intrinsic::amdgcn_raw_tbuffer_store:
9752 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9753 SDValue VData = Op.getOperand(2);
9754 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9755 if (IsD16)
9756 VData = handleD16VData(VData, DAG);
9757 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9758 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9759 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9760 SDValue Ops[] = {
9761 Chain,
9762 VData, // vdata
9763 Rsrc, // rsrc
9764 DAG.getConstant(0, DL, MVT::i32), // vindex
9765 VOffset, // voffset
9766 SOffset, // soffset
9767 Offset, // offset
9768 Op.getOperand(6), // format
9769 Op.getOperand(7), // cachepolicy, swizzled buffer
9770 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9771 };
9772 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9773 : AMDGPUISD::TBUFFER_STORE_FORMAT;
9774 MemSDNode *M = cast<MemSDNode>(Op);
9775 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9776 M->getMemoryVT(), M->getMemOperand());
9777 }
9778
9779 case Intrinsic::amdgcn_raw_buffer_store:
9780 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9781 case Intrinsic::amdgcn_raw_buffer_store_format:
9782 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9783 const bool IsFormat =
9784 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9785 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9786
9787 SDValue VData = Op.getOperand(2);
9788 EVT VDataVT = VData.getValueType();
9789 EVT EltType = VDataVT.getScalarType();
9790 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9791 if (IsD16) {
9792 VData = handleD16VData(VData, DAG);
9793 VDataVT = VData.getValueType();
9794 }
9795
9796 if (!isTypeLegal(VDataVT)) {
9797 VData =
9798 DAG.getNode(ISD::BITCAST, DL,
9799 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9800 }
9801
9802 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9803 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9804 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9805 SDValue Ops[] = {
9806 Chain,
9807 VData,
9808 Rsrc,
9809 DAG.getConstant(0, DL, MVT::i32), // vindex
9810 VOffset, // voffset
9811 SOffset, // soffset
9812 Offset, // offset
9813 Op.getOperand(6), // cachepolicy, swizzled buffer
9814 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9815 };
9816 unsigned Opc =
9817 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9818 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9819 MemSDNode *M = cast<MemSDNode>(Op);
9820
9821 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9822 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9823 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9824
9825 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9826 M->getMemoryVT(), M->getMemOperand());
9827 }
9828
9829 case Intrinsic::amdgcn_struct_buffer_store:
9830 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9831 case Intrinsic::amdgcn_struct_buffer_store_format:
9832 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9833 const bool IsFormat =
9834 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9835 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9836
9837 SDValue VData = Op.getOperand(2);
9838 EVT VDataVT = VData.getValueType();
9839 EVT EltType = VDataVT.getScalarType();
9840 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9841
9842 if (IsD16) {
9843 VData = handleD16VData(VData, DAG);
9844 VDataVT = VData.getValueType();
9845 }
9846
9847 if (!isTypeLegal(VDataVT)) {
9848 VData =
9849 DAG.getNode(ISD::BITCAST, DL,
9850 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9851 }
9852
9853 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9854 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9855 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9856 SDValue Ops[] = {
9857 Chain,
9858 VData,
9859 Rsrc,
9860 Op.getOperand(4), // vindex
9861 VOffset, // voffset
9862 SOffset, // soffset
9863 Offset, // offset
9864 Op.getOperand(7), // cachepolicy, swizzled buffer
9865 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9866 };
9867 unsigned Opc =
9868 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9869 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9870 MemSDNode *M = cast<MemSDNode>(Op);
9871
9872 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9873 EVT VDataType = VData.getValueType().getScalarType();
9874 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9875 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9876
9877 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9878 M->getMemoryVT(), M->getMemOperand());
9879 }
9880 case Intrinsic::amdgcn_raw_buffer_load_lds:
9881 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9882 case Intrinsic::amdgcn_struct_buffer_load_lds:
9883 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9884 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9885 unsigned Opc;
9886 bool HasVIndex =
9887 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9888 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9889 unsigned OpOffset = HasVIndex ? 1 : 0;
9890 SDValue VOffset = Op.getOperand(5 + OpOffset);
9891 bool HasVOffset = !isNullConstant(VOffset);
9892 unsigned Size = Op->getConstantOperandVal(4);
9893
9894 switch (Size) {
9895 default:
9896 return SDValue();
9897 case 1:
9898 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9899 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9900 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9901 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9902 break;
9903 case 2:
9904 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9905 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9906 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9907 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9908 break;
9909 case 4:
9910 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9911 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9912 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9913 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9914 break;
9915 case 12:
9916 if (!Subtarget->hasLDSLoadB96_B128())
9917 return SDValue();
9918 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9919 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9920 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9921 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9922 break;
9923 case 16:
9924 if (!Subtarget->hasLDSLoadB96_B128())
9925 return SDValue();
9926 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9927 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9928 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9929 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9930 break;
9931 }
9932
9933 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9934
9935 SmallVector<SDValue, 8> Ops;
9936
9937 if (HasVIndex && HasVOffset)
9938 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9939 {Op.getOperand(5), // VIndex
9940 VOffset}));
9941 else if (HasVIndex)
9942 Ops.push_back(Op.getOperand(5));
9943 else if (HasVOffset)
9944 Ops.push_back(VOffset);
9945
9946 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9947 Ops.push_back(Rsrc);
9948 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9949 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9950 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9951 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9952 Ops.push_back(DAG.getTargetConstant(
9953 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9954 DL, MVT::i8)); // cpol
9955 Ops.push_back(DAG.getTargetConstant(
9956 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9957 ? 1
9958 : 0,
9959 DL, MVT::i8)); // swz
9960 Ops.push_back(M0Val.getValue(0)); // Chain
9961 Ops.push_back(M0Val.getValue(1)); // Glue
9962
9963 auto *M = cast<MemSDNode>(Op);
9964 MachineMemOperand *LoadMMO = M->getMemOperand();
9965 // Don't set the offset value here because the pointer points to the base of
9966 // the buffer.
9967 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9968
9969 MachinePointerInfo StorePtrI = LoadPtrI;
9970 LoadPtrI.V = PoisonValue::get(
9971 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9972 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9973 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9974
9975 auto F = LoadMMO->getFlags() &
9976 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9977 LoadMMO =
9978 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9979 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9980
9981 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9982 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9983 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9984
9985 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9986 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9987
9988 return SDValue(Load, 0);
9989 }
9990 case Intrinsic::amdgcn_global_load_lds: {
9991 unsigned Opc;
9992 unsigned Size = Op->getConstantOperandVal(4);
9993 switch (Size) {
9994 default:
9995 return SDValue();
9996 case 1:
9997 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9998 break;
9999 case 2:
10000 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10001 break;
10002 case 4:
10003 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10004 break;
10005 case 12:
10006 if (!Subtarget->hasLDSLoadB96_B128())
10007 return SDValue();
10008 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10009 break;
10010 case 16:
10011 if (!Subtarget->hasLDSLoadB96_B128())
10012 return SDValue();
10013 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10014 break;
10015 }
10016
10017 auto *M = cast<MemSDNode>(Op);
10018 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10019
10020 SmallVector<SDValue, 6> Ops;
10021
10022 SDValue Addr = Op.getOperand(2); // Global ptr
10023 SDValue VOffset;
10024 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10025 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10026 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10027 SDValue LHS = Addr.getOperand(0);
10028 SDValue RHS = Addr.getOperand(1);
10029
10030 if (LHS->isDivergent())
10031 std::swap(LHS, RHS);
10032
10033 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10034 RHS.getOperand(0).getValueType() == MVT::i32) {
10035 // add (i64 sgpr), (zero_extend (i32 vgpr))
10036 Addr = LHS;
10037 VOffset = RHS.getOperand(0);
10038 }
10039 }
10040
10041 Ops.push_back(Addr);
10042 if (!Addr->isDivergent()) {
10043 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10044 if (!VOffset)
10045 VOffset =
10046 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10047 DAG.getTargetConstant(0, DL, MVT::i32)),
10048 0);
10049 Ops.push_back(VOffset);
10050 }
10051
10052 Ops.push_back(Op.getOperand(5)); // Offset
10053 Ops.push_back(Op.getOperand(6)); // CPol
10054 Ops.push_back(M0Val.getValue(0)); // Chain
10055 Ops.push_back(M0Val.getValue(1)); // Glue
10056
10057 MachineMemOperand *LoadMMO = M->getMemOperand();
10058 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10059 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10060 MachinePointerInfo StorePtrI = LoadPtrI;
10061 LoadPtrI.V = PoisonValue::get(
10062 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10063 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10064 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10065 auto F = LoadMMO->getFlags() &
10066 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10067 LoadMMO =
10068 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10069 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10070 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10071 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10072 LoadMMO->getAAInfo());
10073
10074 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10075 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10076
10077 return SDValue(Load, 0);
10078 }
10079 case Intrinsic::amdgcn_end_cf:
10080 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10081 Op->getOperand(2), Chain),
10082 0);
10083 case Intrinsic::amdgcn_s_barrier_init:
10084 case Intrinsic::amdgcn_s_barrier_signal_var: {
10085 // these two intrinsics have two operands: barrier pointer and member count
10086 SDValue Chain = Op->getOperand(0);
10087 SmallVector<SDValue, 2> Ops;
10088 SDValue BarOp = Op->getOperand(2);
10089 SDValue CntOp = Op->getOperand(3);
10090 SDValue M0Val;
10091 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10092 ? AMDGPU::S_BARRIER_INIT_M0
10093 : AMDGPU::S_BARRIER_SIGNAL_M0;
10094 // extract the BarrierID from bits 4-9 of BarOp
10095 SDValue BarID;
10096 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10097 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10098 BarID =
10099 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10100 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10101 0);
10102 // Member count should be put into M0[ShAmt:+6]
10103 // Barrier ID should be put into M0[5:0]
10104 M0Val =
10105 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10106 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10107 0);
10108 constexpr unsigned ShAmt = 16;
10109 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10110 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10111
10112 M0Val = SDValue(
10113 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10114
10115 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10116
10117 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10118 return SDValue(NewMI, 0);
10119 }
10120 case Intrinsic::amdgcn_s_barrier_join: {
10121 // this intrinsic has one operand: barrier pointer
10122 SDValue Chain = Op->getOperand(0);
10123 SmallVector<SDValue, 2> Ops;
10124 SDValue BarOp = Op->getOperand(2);
10125 unsigned Opc;
10126
10127 if (isa<ConstantSDNode>(BarOp)) {
10128 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10129 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10130
10131 // extract the BarrierID from bits 4-9 of the immediate
10132 unsigned BarID = (BarVal >> 4) & 0x3F;
10133 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10134 Ops.push_back(K);
10135 Ops.push_back(Chain);
10136 } else {
10137 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10138
10139 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10140 SDValue M0Val;
10141 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10142 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10143 M0Val =
10144 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10145 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10146 0);
10147 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10148 }
10149
10150 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10151 return SDValue(NewMI, 0);
10152 }
10153 case Intrinsic::amdgcn_s_prefetch_data: {
10154 // For non-global address space preserve the chain and remove the call.
10155 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10156 return Op.getOperand(0);
10157 return Op;
10158 }
10159 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10160 SDValue Ops[] = {
10161 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10162 Op.getOperand(3), // offset
10163 Op.getOperand(4), // length
10164 };
10165
10166 MemSDNode *M = cast<MemSDNode>(Op);
10167 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10168 Op->getVTList(), Ops, M->getMemoryVT(),
10169 M->getMemOperand());
10170 }
10171 default: {
10172 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10173 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10174 return lowerImage(Op, ImageDimIntr, DAG, true);
10175
10176 return Op;
10177 }
10178 }
10179}
10180
10181// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10182// offset (the offset that is included in bounds checking and swizzling, to be
10183// split between the instruction's voffset and immoffset fields) and soffset
10184// (the offset that is excluded from bounds checking and swizzling, to go in
10185// the instruction's soffset field). This function takes the first kind of
10186// offset and figures out how to split it between voffset and immoffset.
10187std::pair<SDValue, SDValue>
10188SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10189 SDLoc DL(Offset);
10190 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10191 SDValue N0 = Offset;
10192 ConstantSDNode *C1 = nullptr;
10193
10194 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10195 N0 = SDValue();
10196 else if (DAG.isBaseWithConstantOffset(N0)) {
10197 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10198 N0 = N0.getOperand(0);
10199 }
10200
10201 if (C1) {
10202 unsigned ImmOffset = C1->getZExtValue();
10203 // If the immediate value is too big for the immoffset field, put only bits
10204 // that would normally fit in the immoffset field. The remaining value that
10205 // is copied/added for the voffset field is a large power of 2, and it
10206 // stands more chance of being CSEd with the copy/add for another similar
10207 // load/store.
10208 // However, do not do that rounding down if that is a negative
10209 // number, as it appears to be illegal to have a negative offset in the
10210 // vgpr, even if adding the immediate offset makes it positive.
10211 unsigned Overflow = ImmOffset & ~MaxImm;
10212 ImmOffset -= Overflow;
10213 if ((int32_t)Overflow < 0) {
10214 Overflow += ImmOffset;
10215 ImmOffset = 0;
10216 }
10217 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10218 if (Overflow) {
10219 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10220 if (!N0)
10221 N0 = OverflowVal;
10222 else {
10223 SDValue Ops[] = {N0, OverflowVal};
10224 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10225 }
10226 }
10227 }
10228 if (!N0)
10229 N0 = DAG.getConstant(0, DL, MVT::i32);
10230 if (!C1)
10231 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10232 return {N0, SDValue(C1, 0)};
10233}
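// Illustrative example of the split above, assuming a 12-bit MUBUF immediate
// (getMaxMUBUFImmOffset() == 4095; the real limit is subtarget dependent):
// a combined offset of 4160 becomes Overflow = 4160 & ~4095 = 4096 and
// ImmOffset = 64, so the voffset side carries the reusable power-of-two 4096
// while 64 goes into the instruction's immoffset field. If the overflow part
// would be negative as an i32, the code instead keeps the whole value on the
// voffset side and uses an immoffset of 0.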
10234
10235// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10236// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10237// pointed to by Offsets.
10238void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10239 SelectionDAG &DAG, SDValue *Offsets,
10240 Align Alignment) const {
10241 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10242 SDLoc DL(CombinedOffset);
10243 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10244 uint32_t Imm = C->getZExtValue();
10245 uint32_t SOffset, ImmOffset;
10246 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10247 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10248 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10249 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10250 return;
10251 }
10252 }
10253 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10254 SDValue N0 = CombinedOffset.getOperand(0);
10255 SDValue N1 = CombinedOffset.getOperand(1);
10256 uint32_t SOffset, ImmOffset;
10257 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10258 if (Offset >= 0 &&
10259 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10260 Offsets[0] = N0;
10261 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10262 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10263 return;
10264 }
10265 }
10266
10267 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10268 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10269 : DAG.getConstant(0, DL, MVT::i32);
10270
10271 Offsets[0] = CombinedOffset;
10272 Offsets[1] = SOffsetZero;
10273 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10274}
10275
10276SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10277 SelectionDAG &DAG) const {
10278 if (!MaybePointer.getValueType().isScalarInteger())
10279 return MaybePointer;
10280
10281 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10282 return Rsrc;
10283}
10284
10285// Wrap a global or flat pointer into a buffer intrinsic using the flags
10286// specified in the intrinsic.
10287SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10288 SelectionDAG &DAG) const {
10289 SDLoc Loc(Op);
10290
10291 SDValue Pointer = Op->getOperand(1);
10292 SDValue Stride = Op->getOperand(2);
10293 SDValue NumRecords = Op->getOperand(3);
10294 SDValue Flags = Op->getOperand(4);
10295
10296 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10297 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10298 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10299 std::optional<uint32_t> ConstStride = std::nullopt;
10300 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10301 ConstStride = ConstNode->getZExtValue();
10302
10303 SDValue NewHighHalf = Masked;
10304 if (!ConstStride || *ConstStride != 0) {
10305 SDValue ShiftedStride;
10306 if (ConstStride) {
10307 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10308 } else {
10309 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10310 ShiftedStride =
10311 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10312 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10313 }
10314 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10315 }
10316
10317 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10318 NewHighHalf, NumRecords, Flags);
10319 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10320 return RsrcPtr;
10321}
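// Sketch of the v4i32 resource record produced above (a simplified reading of
// the code, not an authoritative V# description):
//   word0 = pointer[31:0]
//   word1 = pointer[47:32] | (stride << 16)   // stride OR skipped when known 0
//   word2 = NumRecords
//   word3 = Flags
// The BUILD_VECTOR is then bitcast to i128 so the result can stand in for the
// buffer resource value expected by the buffer intrinsics.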
10322
10323// Handle 8 bit and 16 bit buffer loads
10324SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10325 EVT LoadVT, SDLoc DL,
10326 ArrayRef<SDValue> Ops,
10327 MachineMemOperand *MMO,
10328 bool IsTFE) const {
10329 EVT IntVT = LoadVT.changeTypeToInteger();
10330
10331 if (IsTFE) {
10332 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10333 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10334 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10335 MachineFunction &MF = DAG.getMachineFunction();
10336 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10337 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10338 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10339 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10340 DAG.getConstant(1, DL, MVT::i32));
10341 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10342 DAG.getConstant(0, DL, MVT::i32));
10343 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10344 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10345 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10346 }
10347
10348 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10349 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10350 : AMDGPUISD::BUFFER_LOAD_USHORT;
10351
10352 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10353 SDValue BufferLoad =
10354 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10355 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10356 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10357
10358 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10359}
10360
10361// Handle 8 bit and 16 bit buffer stores
10362SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10363 EVT VDataType, SDLoc DL,
10364 SDValue Ops[],
10365 MemSDNode *M) const {
10366 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10367 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10368
10369 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10370 Ops[1] = BufferStoreExt;
10371 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10372 : AMDGPUISD::BUFFER_STORE_SHORT;
10373 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10374 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10375 M->getMemOperand());
10376}
10377
10378static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10379 SDValue Op, const SDLoc &SL, EVT VT) {
10380 if (VT.bitsLT(Op.getValueType()))
10381 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10382
10383 switch (ExtType) {
10384 case ISD::SEXTLOAD:
10385 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10386 case ISD::ZEXTLOAD:
10387 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10388 case ISD::EXTLOAD:
10389 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10390 case ISD::NON_EXTLOAD:
10391 return Op;
10392 }
10393
10394 llvm_unreachable("invalid ext type");
10395}
10396
10397// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10398// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10399SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10400 DAGCombinerInfo &DCI) const {
10401 SelectionDAG &DAG = DCI.DAG;
10402 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10403 return SDValue();
10404
10405 // FIXME: Constant loads should all be marked invariant.
10406 unsigned AS = Ld->getAddressSpace();
10407 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10408 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10409 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10410 return SDValue();
10411
10412 // Don't do this early, since it may interfere with adjacent load merging for
10413 // illegal types. We can avoid losing alignment information for exotic types
10414 // pre-legalize.
10415 EVT MemVT = Ld->getMemoryVT();
10416 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10417 MemVT.getSizeInBits() >= 32)
10418 return SDValue();
10419
10420 SDLoc SL(Ld);
10421
10422 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10423 "unexpected vector extload");
10424
10425 // TODO: Drop only high part of range.
10426 SDValue Ptr = Ld->getBasePtr();
10427 SDValue NewLoad = DAG.getLoad(
10428 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10429 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10430 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10431 nullptr); // Drop ranges
10432
10433 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10434 if (MemVT.isFloatingPoint()) {
10436 "unexpected fp extload");
10437 TruncVT = MemVT.changeTypeToInteger();
10438 }
10439
10440 SDValue Cvt = NewLoad;
10441 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10442 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10443 DAG.getValueType(TruncVT));
10444 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10445 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10446 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10447 } else {
10448 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10449 }
10450
10451 EVT VT = Ld->getValueType(0);
10452 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10453
10454 DCI.AddToWorklist(Cvt.getNode());
10455
10456 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10457 // the appropriate extension from the 32-bit load.
10458 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10459 DCI.AddToWorklist(Cvt.getNode());
10460
10461 // Handle conversion back to floating point if necessary.
10462 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10463
10464 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10465}
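// Illustrative effect of the combine above (not taken from the source): a
// uniform, naturally aligned sub-dword scalar load such as
//   %v = load i16, ptr addrspace(4) %p, align 4
// is rewritten as a 32-bit load of %p followed by a zero-extend-in-reg (or a
// sign_extend_inreg for sextloads), so it can select to an SMEM dword load
// instead of being forced onto the VALU/buffer path.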
10466
10467static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10468 const SIMachineFunctionInfo &Info) {
10469 // TODO: Should check if the address can definitely not access stack.
10470 if (Info.isEntryFunction())
10471 return Info.getUserSGPRInfo().hasFlatScratchInit();
10472 return true;
10473}
10474
10475SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10476 SDLoc DL(Op);
10477 LoadSDNode *Load = cast<LoadSDNode>(Op);
10478 ISD::LoadExtType ExtType = Load->getExtensionType();
10479 EVT MemVT = Load->getMemoryVT();
10480 MachineMemOperand *MMO = Load->getMemOperand();
10481
10482 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10483 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10484 return SDValue();
10485
10486 // FIXME: Copied from PPC
10487 // First, load into 32 bits, then truncate to 1 bit.
10488
10489 SDValue Chain = Load->getChain();
10490 SDValue BasePtr = Load->getBasePtr();
10491
10492 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10493
10494 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10495 RealMemVT, MMO);
10496
10497 if (!MemVT.isVector()) {
10498 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10499 NewLD.getValue(1)};
10500
10501 return DAG.getMergeValues(Ops, DL);
10502 }
10503
10504 SmallVector<SDValue, 3> Elts;
10505 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10506 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10507 DAG.getConstant(I, DL, MVT::i32));
10508
10509 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10510 }
10511
10512 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10513
10514 return DAG.getMergeValues(Ops, DL);
10515 }
10516
10517 if (!MemVT.isVector())
10518 return SDValue();
10519
10520 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10521 "Custom lowering for non-i32 vectors hasn't been implemented.");
10522
10523 Align Alignment = Load->getAlign();
10524 unsigned AS = Load->getAddressSpace();
10525 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10526 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10527 return SplitVectorLoad(Op, DAG);
10528 }
10529
10530 MachineFunction &MF = DAG.getMachineFunction();
10531 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10532 // If there is a possibility that flat instruction access scratch memory
10533 // then we need to use the same legalization rules we use for private.
10534 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10535 !Subtarget->hasMultiDwordFlatScratchAddressing())
10536 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10537 ? AMDGPUAS::PRIVATE_ADDRESS
10538 : AMDGPUAS::GLOBAL_ADDRESS;
10539
10540 unsigned NumElements = MemVT.getVectorNumElements();
10541
10542 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10543 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10544 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10545 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10546 isMemOpHasNoClobberedMemOperand(Load))) {
10547 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10548 Alignment >= Align(4) && NumElements < 32) {
10549 if (MemVT.isPow2VectorType() ||
10550 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10551 return SDValue();
10552 return WidenOrSplitVectorLoad(Op, DAG);
10553 }
10554 // Non-uniform loads will be selected to MUBUF instructions, so they
10555 // have the same legalization requirements as global and private
10556 // loads.
10557 //
10558 }
10559 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10560 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10561 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10562 if (NumElements > 4)
10563 return SplitVectorLoad(Op, DAG);
10564 // v3 loads not supported on SI.
10565 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10566 return WidenOrSplitVectorLoad(Op, DAG);
10567
10568 // v3 and v4 loads are supported for private and global memory.
10569 return SDValue();
10570 }
10571 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10572 // Depending on the setting of the private_element_size field in the
10573 // resource descriptor, we can only make private accesses up to a certain
10574 // size.
10575 switch (Subtarget->getMaxPrivateElementSize()) {
10576 case 4: {
10577 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10578 return DAG.getMergeValues({Op0, Op1}, DL);
10579 }
10580 case 8:
10581 if (NumElements > 2)
10582 return SplitVectorLoad(Op, DAG);
10583 return SDValue();
10584 case 16:
10585 // Same as global/flat
10586 if (NumElements > 4)
10587 return SplitVectorLoad(Op, DAG);
10588 // v3 loads not supported on SI.
10589 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10590 return WidenOrSplitVectorLoad(Op, DAG);
10591
10592 return SDValue();
10593 default:
10594 llvm_unreachable("unsupported private_element_size");
10595 }
10596 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10597 unsigned Fast = 0;
10598 auto Flags = Load->getMemOperand()->getFlags();
10599 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10600 Load->getAlign(), Flags, &Fast) &&
10601 Fast > 1)
10602 return SDValue();
10603
10604 if (MemVT.isVector())
10605 return SplitVectorLoad(Op, DAG);
10606 }
10607
10608 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10609 MemVT, *Load->getMemOperand())) {
10610 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10611 return DAG.getMergeValues({Op0, Op1}, DL);
10612 }
10613
10614 return SDValue();
10615}
10616
10617SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10618 EVT VT = Op.getValueType();
10619 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10620 VT.getSizeInBits() == 512)
10621 return splitTernaryVectorOp(Op, DAG);
10622
10623 assert(VT.getSizeInBits() == 64);
10624
10625 SDLoc DL(Op);
10626 SDValue Cond = Op.getOperand(0);
10627
10628 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10629 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10630
10631 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10632 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10633
10634 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10635 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10636
10637 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10638
10639 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10640 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10641
10642 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10643
10644 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10645 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10646}
10647
10648// Catch division cases where we can use shortcuts with rcp and rsq
10649// instructions.
10650SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10651 SelectionDAG &DAG) const {
10652 SDLoc SL(Op);
10653 SDValue LHS = Op.getOperand(0);
10654 SDValue RHS = Op.getOperand(1);
10655 EVT VT = Op.getValueType();
10656 const SDNodeFlags Flags = Op->getFlags();
10657
10658 bool AllowInaccurateRcp =
10659 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10660
10661 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10662 // Without !fpmath accuracy information, we can't do more because we don't
10663 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10664 // f16 is always accurate enough
10665 if (!AllowInaccurateRcp && VT != MVT::f16)
10666 return SDValue();
10667
10668 if (CLHS->isExactlyValue(1.0)) {
10669 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10670 // the CI documentation have a worst case error of 1 ulp.
10671 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10672 // use it as long as we aren't trying to use denormals.
10673 //
10674 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10675
10676 // 1.0 / sqrt(x) -> rsq(x)
10677
10678 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10679 // error seems really high at 2^29 ULP.
10680 // 1.0 / x -> rcp(x)
10681 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10682 }
10683
10684 // Same as for 1.0, but expand the sign out of the constant.
10685 if (CLHS->isExactlyValue(-1.0)) {
10686 // -1.0 / x -> rcp (fneg x)
10687 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10688 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10689 }
10690 }
10691
10692 // For f16 require afn or arcp.
10693 // For f32 require afn.
10694 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10695 return SDValue();
10696
10697 // Turn into multiply by the reciprocal.
10698 // x / y -> x * (1.0 / y)
10699 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10700 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10701}
10702
10703SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10704 SelectionDAG &DAG) const {
10705 SDLoc SL(Op);
10706 SDValue X = Op.getOperand(0);
10707 SDValue Y = Op.getOperand(1);
10708 EVT VT = Op.getValueType();
10709 const SDNodeFlags Flags = Op->getFlags();
10710
10711 bool AllowInaccurateDiv =
10712 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10713 if (!AllowInaccurateDiv)
10714 return SDValue();
10715
10716 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10717 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10718
10719 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10720 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10721
10722 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10723 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10724 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10725 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10726 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10727 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10728}
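// Reading of the sequence above in math terms (informal): with r = rcp(y),
// two Newton-Raphson style corrections are applied as
//   r <- r + r*(1 - y*r)        (Tmp0/Tmp1 are the 1 - y*r residuals)
// and the quotient q = x*r gets one final residual correction
//   q <- q + r*(x - y*q)        (Tmp2 is the x - y*q residual)
// which is why this path is only used when approximate/unsafe math is allowed.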
10729
10730static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10731 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10732 SDNodeFlags Flags) {
10733 if (GlueChain->getNumValues() <= 1) {
10734 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10735 }
10736
10737 assert(GlueChain->getNumValues() == 3);
10738
10739 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10740 switch (Opcode) {
10741 default:
10742 llvm_unreachable("no chain equivalent for opcode");
10743 case ISD::FMUL:
10744 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10745 break;
10746 }
10747
10748 return DAG.getNode(Opcode, SL, VTList,
10749 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10750 Flags);
10751}
10752
10753static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10754 EVT VT, SDValue A, SDValue B, SDValue C,
10755 SDValue GlueChain, SDNodeFlags Flags) {
10756 if (GlueChain->getNumValues() <= 1) {
10757 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10758 }
10759
10760 assert(GlueChain->getNumValues() == 3);
10761
10762 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10763 switch (Opcode) {
10764 default:
10765 llvm_unreachable("no chain equivalent for opcode");
10766 case ISD::FMA:
10767 Opcode = AMDGPUISD::FMA_W_CHAIN;
10768 break;
10769 }
10770
10771 return DAG.getNode(Opcode, SL, VTList,
10772 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10773 Flags);
10774}
10775
10776SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10777 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10778 return FastLowered;
10779
10780 SDLoc SL(Op);
10781 SDValue LHS = Op.getOperand(0);
10782 SDValue RHS = Op.getOperand(1);
10783
10784 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10785 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10786 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10787 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10788 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10789 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10790 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10791 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10792 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10793 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10794 // q16.u = opx(V_CVT_F16_F32, q32.u);
10795 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10796
10797 // We will use ISD::FMA on targets that don't support ISD::FMAD.
10798 unsigned FMADOpCode =
10799 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10800
10801 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10802 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10803 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10804 SDValue Rcp =
10805 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10806 SDValue Quot =
10807 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10808 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10809 Op->getFlags());
10810 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10811 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10812 Op->getFlags());
10813 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10814 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10815 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10816 DAG.getConstant(0xff800000, SL, MVT::i32));
10817 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10818 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10819 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10820 DAG.getTargetConstant(0, SL, MVT::i32));
10821 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10822 Op->getFlags());
10823}
10824
10825// Faster 2.5 ULP division that does not support denormals.
10826SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10827 SDNodeFlags Flags = Op->getFlags();
10828 SDLoc SL(Op);
10829 SDValue LHS = Op.getOperand(1);
10830 SDValue RHS = Op.getOperand(2);
10831
10832 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10833
10834 const APFloat K0Val(0x1p+96f);
10835 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10836
10837 const APFloat K1Val(0x1p-32f);
10838 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10839
10840 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10841
10842 EVT SetCCVT =
10843 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10844
10845 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10846
10847 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10848
10849 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10850
10851 // rcp does not support denormals.
10852 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10853
10854 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10855
10856 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10857}
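// Informal sketch of the scaling trick above: rcp misbehaves for very large
// denominators, so when |RHS| > 2^96 the denominator is pre-scaled by 2^-32
// before the rcp and the final product is multiplied by the same 2^-32 factor
// (r3), keeping the intermediate 1/|RHS| representable. For ordinary
// magnitudes r3 is 1.0 and the sequence degenerates to LHS * rcp(RHS).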
10858
10859// Returns immediate value for setting the F32 denorm mode when using the
10860// S_DENORM_MODE instruction.
10861static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10862 const SIMachineFunctionInfo *Info,
10863 const GCNSubtarget *ST) {
10864 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10865 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10866 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10867 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10868}
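// Layout assumed by the helper above: the value written by S_DENORM_MODE keeps
// the FP32 denormal controls in bits [1:0] and the FP64/FP16 controls in bits
// [3:2], so the incoming SPDenormMode is combined with the function's existing
// double/half setting shifted left by 2. (Informal note; the authoritative
// encoding is the one in the ISA documents / SIDefines.h.)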
10869
10870SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10871 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10872 return FastLowered;
10873
10874 // The selection matcher assumes anything with a chain selects to a
10875 // mayRaiseFPException machine instruction. Since we're introducing a chain
10876 // here, we need to explicitly report nofpexcept for the regular fdiv
10877 // lowering.
10878 SDNodeFlags Flags = Op->getFlags();
10879 Flags.setNoFPExcept(true);
10880
10881 SDLoc SL(Op);
10882 SDValue LHS = Op.getOperand(0);
10883 SDValue RHS = Op.getOperand(1);
10884
10885 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10886
10887 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10888
10889 SDValue DenominatorScaled =
10890 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10891 SDValue NumeratorScaled =
10892 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10893
10894 // Denominator is scaled to not be denormal, so using rcp is ok.
10895 SDValue ApproxRcp =
10896 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10897 SDValue NegDivScale0 =
10898 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10899
10900 using namespace AMDGPU::Hwreg;
10901 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10902 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10903
10904 const MachineFunction &MF = DAG.getMachineFunction();
10905 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10906 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10907
10908 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10909 const bool HasDynamicDenormals =
10910 (DenormMode.Input == DenormalMode::Dynamic) ||
10911 (DenormMode.Output == DenormalMode::Dynamic);
10912
10913 SDValue SavedDenormMode;
10914
10915 if (!PreservesDenormals) {
10916 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10917 // lowering. The chain dependence is insufficient, and we need glue. We do
10918 // not need the glue variants in a strictfp function.
10919
10920 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10921
10922 SDValue Glue = DAG.getEntryNode();
10923 if (HasDynamicDenormals) {
10924 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10925 DAG.getVTList(MVT::i32, MVT::Glue),
10926 {BitField, Glue});
10927 SavedDenormMode = SDValue(GetReg, 0);
10928
10929 Glue = DAG.getMergeValues(
10930 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10931 }
10932
10933 SDNode *EnableDenorm;
10934 if (Subtarget->hasDenormModeInst()) {
10935 const SDValue EnableDenormValue =
10936 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10937
10938 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10939 EnableDenormValue)
10940 .getNode();
10941 } else {
10942 const SDValue EnableDenormValue =
10943 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
10944 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10945 {EnableDenormValue, BitField, Glue});
10946 }
10947
10948 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
10949 SDValue(EnableDenorm, 1)};
10950
10951 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10952 }
10953
10954 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10955 ApproxRcp, One, NegDivScale0, Flags);
10956
10957 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10958 ApproxRcp, Fma0, Flags);
10959
10960 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
10961 Fma1, Flags);
10962
10963 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10964 NumeratorScaled, Mul, Flags);
10965
10966 SDValue Fma3 =
10967 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
10968
10969 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10970 NumeratorScaled, Fma3, Flags);
10971
10972 if (!PreservesDenormals) {
10973 SDNode *DisableDenorm;
10974 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10975 const SDValue DisableDenormValue = getSPDenormModeValue(
10976 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10977
10978 DisableDenorm =
10979 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
10980 DisableDenormValue, Fma4.getValue(2))
10981 .getNode();
10982 } else {
10983 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10984 const SDValue DisableDenormValue =
10985 HasDynamicDenormals
10986 ? SavedDenormMode
10987 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10988
10989 DisableDenorm = DAG.getMachineNode(
10990 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10991 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10992 }
10993
10994 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10995 SDValue(DisableDenorm, 0), DAG.getRoot());
10996 DAG.setRoot(OutputChain);
10997 }
10998
10999 SDValue Scale = NumeratorScaled.getValue(1);
11000 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11001 {Fma4, Fma1, Fma3, Scale}, Flags);
11002
11003 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11004}
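// High-level recap of the f32 path above (informal): div_scale pre-scales the
// operands away from the denormal range, rcp gives an approximation of 1/d, a
// short chain of fma-based refinement steps tightens the quotient while FP32
// denormal flushing is temporarily disabled (via S_DENORM_MODE or S_SETREG),
// and div_fmas plus div_fixup undo the scaling and patch up infinities, zeros
// and NaNs.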
11005
11006SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11007 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11008 return FastLowered;
11009
11010 SDLoc SL(Op);
11011 SDValue X = Op.getOperand(0);
11012 SDValue Y = Op.getOperand(1);
11013
11014 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11015
11016 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11017
11018 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11019
11020 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11021
11022 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11023
11024 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11025
11026 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11027
11028 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11029
11030 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11031
11032 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11033 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11034
11035 SDValue Fma4 =
11036 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11037
11038 SDValue Scale;
11039
11040 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11041 // Workaround a hardware bug on SI where the condition output from div_scale
11042 // is not usable.
11043
11044 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11045
11046 // Figure out which scale to use for div_fmas.
11047 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11048 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11049 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11050 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11051
11052 SDValue NumHi =
11053 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11054 SDValue DenHi =
11055 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11056
11057 SDValue Scale0Hi =
11058 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11059 SDValue Scale1Hi =
11060 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11061
11062 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11063 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11064 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11065 } else {
11066 Scale = DivScale1.getValue(1);
11067 }
11068
11069 SDValue Fmas =
11070 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11071
11072 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11073}
11074
11075SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11076 EVT VT = Op.getValueType();
11077
11078 if (VT == MVT::f32)
11079 return LowerFDIV32(Op, DAG);
11080
11081 if (VT == MVT::f64)
11082 return LowerFDIV64(Op, DAG);
11083
11084 if (VT == MVT::f16)
11085 return LowerFDIV16(Op, DAG);
11086
11087 llvm_unreachable("Unexpected type for fdiv");
11088}
11089
11090SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11091 SDLoc dl(Op);
11092 SDValue Val = Op.getOperand(0);
11093 EVT VT = Val.getValueType();
11094 EVT ResultExpVT = Op->getValueType(1);
11095 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11096
11097 SDValue Mant = DAG.getNode(
11098 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11099 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11100
11101 SDValue Exp = DAG.getNode(
11102 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11103 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11104
11105 if (Subtarget->hasFractBug()) {
11106 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11107 SDValue Inf =
11108 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11109
11110 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11111 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11112 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11113 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11114 }
11115
11116 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11117 return DAG.getMergeValues({Mant, CastExp}, dl);
11118}
11119
11120SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11121 SDLoc DL(Op);
11122 StoreSDNode *Store = cast<StoreSDNode>(Op);
11123 EVT VT = Store->getMemoryVT();
11124
11125 if (VT == MVT::i1) {
11126 return DAG.getTruncStore(
11127 Store->getChain(), DL,
11128 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11129 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11130 }
11131
11132 assert(VT.isVector() &&
11133 Store->getValue().getValueType().getScalarType() == MVT::i32);
11134
11135 unsigned AS = Store->getAddressSpace();
11136 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11137 Store->getAlign().value() < VT.getStoreSize() &&
11138 VT.getSizeInBits() > 32) {
11139 return SplitVectorStore(Op, DAG);
11140 }
11141
11142 MachineFunction &MF = DAG.getMachineFunction();
11143 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11144 // If there is a possibility that flat instruction access scratch memory
11145 // then we need to use the same legalization rules we use for private.
11146 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11147 !Subtarget->hasMultiDwordFlatScratchAddressing())
11148 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11149 ? AMDGPUAS::PRIVATE_ADDRESS
11150 : AMDGPUAS::GLOBAL_ADDRESS;
11151
11152 unsigned NumElements = VT.getVectorNumElements();
11153 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11154 if (NumElements > 4)
11155 return SplitVectorStore(Op, DAG);
11156 // v3 stores not supported on SI.
11157 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11158 return SplitVectorStore(Op, DAG);
11159
11160 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11161 VT, *Store->getMemOperand()))
11162 return expandUnalignedStore(Store, DAG);
11163
11164 return SDValue();
11165 }
11166 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11167 switch (Subtarget->getMaxPrivateElementSize()) {
11168 case 4:
11169 return scalarizeVectorStore(Store, DAG);
11170 case 8:
11171 if (NumElements > 2)
11172 return SplitVectorStore(Op, DAG);
11173 return SDValue();
11174 case 16:
11175 if (NumElements > 4 ||
11176 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11177 return SplitVectorStore(Op, DAG);
11178 return SDValue();
11179 default:
11180 llvm_unreachable("unsupported private_element_size");
11181 }
11182 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11183 unsigned Fast = 0;
11184 auto Flags = Store->getMemOperand()->getFlags();
11185 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11186 Store->getAlign(), Flags, &Fast) &&
11187 Fast > 1)
11188 return SDValue();
11189
11190 if (VT.isVector())
11191 return SplitVectorStore(Op, DAG);
11192
11193 return expandUnalignedStore(Store, DAG);
11194 }
11195
11196 // Probably an invalid store. If so we'll end up emitting a selection error.
11197 return SDValue();
11198}
11199
11200// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11201SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11202 SDLoc SL(Op);
11203 assert(!Subtarget->has16BitInsts());
11204 SDNodeFlags Flags = Op->getFlags();
11205 SDValue Ext =
11206 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11207
11208 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11209 SDValue Sqrt =
11210 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11211
11212 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11213 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11214}
11215
11216SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11217 SDLoc DL(Op);
11218 SDNodeFlags Flags = Op->getFlags();
11219 MVT VT = Op.getValueType().getSimpleVT();
11220 const SDValue X = Op.getOperand(0);
11221
11222 if (allowApproxFunc(DAG, Flags)) {
11223 // Instruction is 1ulp but ignores denormals.
11224 return DAG.getNode(
11225 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11226 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11227 }
11228
11229 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11230 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11231
11232 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11233
11234 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11235
11236 SDValue SqrtX =
11237 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11238
11239 SDValue SqrtS;
11240 if (needsDenormHandlingF32(DAG, X, Flags)) {
11241 SDValue SqrtID =
11242 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11243 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11244
11245 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11246 SDValue SqrtSNextDownInt =
11247 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11248 DAG.getAllOnesConstant(DL, MVT::i32));
11249 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11250
11251 SDValue NegSqrtSNextDown =
11252 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11253
11254 SDValue SqrtVP =
11255 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11256
11257 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11258 DAG.getConstant(1, DL, MVT::i32));
11259 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11260
11261 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11262 SDValue SqrtVS =
11263 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11264
11265 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11266 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11267
11268 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11269 Flags);
11270
11271 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11272 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11273 Flags);
11274 } else {
11275 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11276
11277 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11278
11279 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11280 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11281 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11282
11283 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11284 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11285 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11286
11287 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11288 SDValue SqrtD =
11289 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11290 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11291 }
11292
11293 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11294
11295 SDValue ScaledDown =
11296 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11297
11298 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11299 SDValue IsZeroOrInf =
11300 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11301 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11302
11303 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11304}
11305
11306SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11307 // For double type, the SQRT and RSQ instructions don't have required
11308 // precision, we apply Goldschmidt's algorithm to improve the result:
11309 //
11310 // y0 = rsq(x)
11311 // g0 = x * y0
11312 // h0 = 0.5 * y0
11313 //
11314 // r0 = 0.5 - h0 * g0
11315 // g1 = g0 * r0 + g0
11316 // h1 = h0 * r0 + h0
11317 //
11318 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11319 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11320 // h2 = h1 * r1 + h1
11321 //
11322 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11323 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11324 //
11325 // sqrt(x) = g3
11326
11327 SDNodeFlags Flags = Op->getFlags();
11328
11329 SDLoc DL(Op);
11330
11331 SDValue X = Op.getOperand(0);
11332 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11333
11334 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11335
11336 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11337
11338 // Scale up input if it is too small.
11339 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11340 SDValue ScaleUp =
11341 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11342 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11343
11344 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11345
11346 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11347
11348 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11349 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11350
11351 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11352 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11353
11354 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11355
11356 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11357
11358 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11359 SDValue SqrtD0 =
11360 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11361
11362 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11363
11364 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11365 SDValue SqrtD1 =
11366 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11367
11368 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11369
11370 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11371 SDValue ScaleDown =
11372 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11373 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11374
11375 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11376 // with finite only or nsz because rsq(+/-0) = +/-inf
11377
11378 // TODO: Check for DAZ and expand to subnormals
11379 SDValue IsZeroOrInf =
11380 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11381 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11382
11383 // If x is +INF, +0, or -0, use its original value
11384 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11385 Flags);
11386}
11387
11388SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11389 SDLoc DL(Op);
11390 EVT VT = Op.getValueType();
11391 SDValue Arg = Op.getOperand(0);
11392 SDValue TrigVal;
11393
11394 // Propagate fast-math flags so that the multiply we introduce can be folded
11395 // if Arg is already the result of a multiply by constant.
11396 auto Flags = Op->getFlags();
11397
11398 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11399
11400 if (Subtarget->hasTrigReducedRange()) {
11401 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11402 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11403 } else {
11404 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11405 }
11406
11407 switch (Op.getOpcode()) {
11408 case ISD::FCOS:
11409 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11410 case ISD::FSIN:
11411 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11412 default:
11413 llvm_unreachable("Wrong trig opcode");
11414 }
11415}
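// Example of the reduction above (illustrative): on subtargets with
// hasTrigReducedRange(), sin(x) becomes SIN_HW(FRACT(x * 1/(2*pi))), i.e. the
// argument is first mapped into [0, 1) turns; older subtargets rely on the
// hardware instruction accepting the full multiplied value directly.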
11416
11417SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11418 SelectionDAG &DAG) const {
11419 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11420 assert(AtomicNode->isCompareAndSwap());
11421 unsigned AS = AtomicNode->getAddressSpace();
11422
11423 // No custom lowering required for local address space
11424 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11425 return Op;
11426
11427 // Non-local address space requires custom lowering for atomic compare
11428 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11429 SDLoc DL(Op);
11430 SDValue ChainIn = Op.getOperand(0);
11431 SDValue Addr = Op.getOperand(1);
11432 SDValue Old = Op.getOperand(2);
11433 SDValue New = Op.getOperand(3);
11434 EVT VT = Op.getValueType();
11435 MVT SimpleVT = VT.getSimpleVT();
11436 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11437
11438 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11439 SDValue Ops[] = {ChainIn, Addr, NewOld};
11440
11441 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11442 Op->getVTList(), Ops, VT,
11443 AtomicNode->getMemOperand());
11444}
11445
11446//===----------------------------------------------------------------------===//
11447// Custom DAG optimizations
11448//===----------------------------------------------------------------------===//
11449
11450SDValue
11451SITargetLowering::performUCharToFloatCombine(SDNode *N,
11452 DAGCombinerInfo &DCI) const {
11453 EVT VT = N->getValueType(0);
11454 EVT ScalarVT = VT.getScalarType();
11455 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11456 return SDValue();
11457
11458 SelectionDAG &DAG = DCI.DAG;
11459 SDLoc DL(N);
11460
11461 SDValue Src = N->getOperand(0);
11462 EVT SrcVT = Src.getValueType();
11463
11464 // TODO: We could try to match extracting the higher bytes, which would be
11465 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11466 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11467 // about in practice.
11468 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11469 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11470 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11471 DCI.AddToWorklist(Cvt.getNode());
11472
11473 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11474 if (ScalarVT != MVT::f32) {
11475 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11476 DAG.getTargetConstant(0, DL, MVT::i32));
11477 }
11478 return Cvt;
11479 }
11480 }
11481
11482 return SDValue();
11483}
11484
11485SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11486 DAGCombinerInfo &DCI) const {
11487 SDValue MagnitudeOp = N->getOperand(0);
11488 SDValue SignOp = N->getOperand(1);
11489 SelectionDAG &DAG = DCI.DAG;
11490 SDLoc DL(N);
11491
11492 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11493 // lower half with a copy.
11494 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11495 if (MagnitudeOp.getValueType() == MVT::f64) {
11496 SDValue MagAsVector =
11497 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11498 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11499 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11500 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11501 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11502
11503 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11504
11505 SDValue Vector =
11506 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11507
11508 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11509 }
11510
11511 if (SignOp.getValueType() != MVT::f64)
11512 return SDValue();
11513
11514 // Reduce width of sign operand, we only need the highest bit.
11515 //
11516 // fcopysign f64:x, f64:y ->
11517 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11518 // TODO: In some cases it might make sense to go all the way to f16.
11519 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11520 SDValue SignAsF32 =
11521 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11522 DAG.getConstant(1, DL, MVT::i32));
11523
11524 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11525 SignAsF32);
11526}
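// Illustrative shape of the f64 combine above: for
//   fcopysign f64:x, f64:y
// only the sign bit of the high 32 bits matters, so the low word of x is kept
// as-is and a single f32 fcopysign is applied to x.hi32 with y (itself reduced
// to its high word), avoiding any 64-bit bit twiddling.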
11527
11528// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11529// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11530// bits
11531
11532// This is a variant of
11533// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11534//
11535// The normal DAG combiner will do this, but only if the add has one use since
11536// that would increase the number of instructions.
11537//
11538// This prevents us from seeing a constant offset that can be folded into a
11539// memory instruction's addressing mode. If we know the resulting add offset of
11540// a pointer can be folded into an addressing offset, we can replace the pointer
11541// operand with the add of new constant offset. This eliminates one of the uses,
11542// and may allow the remaining use to also be simplified.
11543//
11544SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11545 EVT MemVT,
11546 DAGCombinerInfo &DCI) const {
11547 SDValue N0 = N->getOperand(0);
11548 SDValue N1 = N->getOperand(1);
11549
11550 // We only do this to handle cases where it's profitable when there are
11551 // multiple uses of the add, so defer to the standard combine.
11552 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11553 N0->hasOneUse())
11554 return SDValue();
11555
11556 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11557 if (!CN1)
11558 return SDValue();
11559
11560 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11561 if (!CAdd)
11562 return SDValue();
11563
11564 SelectionDAG &DAG = DCI.DAG;
11565
11566 if (N0->getOpcode() == ISD::OR &&
11567 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11568 return SDValue();
11569
11570 // If the resulting offset is too large, we can't fold it into the
11571 // addressing mode offset.
11572 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11573 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11574
11575 AddrMode AM;
11576 AM.HasBaseReg = true;
11577 AM.BaseOffs = Offset.getSExtValue();
11578 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11579 return SDValue();
11580
11581 SDLoc SL(N);
11582 EVT VT = N->getValueType(0);
11583
11584 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11585 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11586
11587 SDNodeFlags Flags;
11588 Flags.setNoUnsignedWrap(
11589 N->getFlags().hasNoUnsignedWrap() &&
11590 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11591
11592 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11593}
11594
11595/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11596/// is offset by the chain and intrinsic ID. Theoretically we would also need to
11597/// check the specific intrinsic, but they all place the pointer operand first.
11598static unsigned getBasePtrIndex(const MemSDNode *N) {
11599 switch (N->getOpcode()) {
11600 case ISD::STORE:
11601 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
11602 case ISD::ATOMIC_STORE:
11603 return 2;
11604 default:
11605 return 1;
11606 }
11607}
11608
11609SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11610 DAGCombinerInfo &DCI) const {
11611 SelectionDAG &DAG = DCI.DAG;
11612 SDLoc SL(N);
11613
11614 unsigned PtrIdx = getBasePtrIndex(N);
11615 SDValue Ptr = N->getOperand(PtrIdx);
11616
11617 // TODO: We could also do this for multiplies.
11618 if (Ptr.getOpcode() == ISD::SHL) {
11619 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11620 N->getMemoryVT(), DCI);
11621 if (NewPtr) {
11622 SmallVector<SDValue, 8> NewOps(N->ops());
11623
11624 NewOps[PtrIdx] = NewPtr;
11625 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11626 }
11627 }
11628
11629 return SDValue();
11630}
11631
11632static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11633 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11634 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11635 (Opc == ISD::XOR && Val == 0);
11636}
11637
11638// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11639// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11640// integer combine opportunities since most 64-bit operations are decomposed
11641// this way. TODO: We won't want this for SALU especially if it is an inline
11642// immediate.
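// Illustrative example (hypothetical constant): (and i64:%x, 0x00000000ffffffff)
// splits into ValLo = 0xffffffff and ValHi = 0, both reducible, so it becomes a
// build_vector of (and lo(%x), -1) and (and hi(%x), 0), which fold to lo(%x)
// and 0.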
11643SDValue SITargetLowering::splitBinaryBitConstantOp(
11644 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11645 const ConstantSDNode *CRHS) const {
11646 uint64_t Val = CRHS->getZExtValue();
11647 uint32_t ValLo = Lo_32(Val);
11648 uint32_t ValHi = Hi_32(Val);
11649 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11650
11651 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11652 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11653 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11654 // If we need to materialize a 64-bit immediate, it will be split up later
11655 // anyway. Avoid creating the harder to understand 64-bit immediate
11656 // materialization.
11657 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11658 }
11659
11660 return SDValue();
11661}
11662
11663bool llvm::isBoolSGPR(SDValue V) {
11664 if (V.getValueType() != MVT::i1)
11665 return false;
11666 switch (V.getOpcode()) {
11667 default:
11668 break;
11669 case ISD::SETCC:
11670 case AMDGPUISD::FP_CLASS:
11671 return true;
11672 case ISD::AND:
11673 case ISD::OR:
11674 case ISD::XOR:
11675 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11676 }
11677 return false;
11678}
11679
11680// If a constant has all zeroes or all ones within each byte return it.
11681// Otherwise return 0.
11682static uint32_t getConstantPermuteMask(uint32_t C) {
11683 // 0xff for any zero byte in the mask
11684 uint32_t ZeroByteMask = 0;
11685 if (!(C & 0x000000ff))
11686 ZeroByteMask |= 0x000000ff;
11687 if (!(C & 0x0000ff00))
11688 ZeroByteMask |= 0x0000ff00;
11689 if (!(C & 0x00ff0000))
11690 ZeroByteMask |= 0x00ff0000;
11691 if (!(C & 0xff000000))
11692 ZeroByteMask |= 0xff000000;
11693 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11694 if ((NonZeroByteMask & C) != NonZeroByteMask)
11695 return 0; // Partial bytes selected.
11696 return C;
11697}
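// Illustrative example: getConstantPermuteMask(0x00ff00ff) returns the value
// unchanged since every byte is 0x00 or 0xff, while getConstantPermuteMask(0x00f0ff00)
// returns 0 because the 0xf0 byte selects only part of a byte.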
11698
11699// Check if a node selects whole bytes from its operand 0 starting at a byte
11700 // boundary while masking the rest. Returns the select mask as used by
11701 // v_perm_b32, or ~0 if the node does not match this pattern.
11702// Note byte select encoding:
11703// value 0-3 selects corresponding source byte;
11704// value 0xc selects zero;
11705// value 0xff selects 0xff.
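// Illustrative examples (hypothetical constants): (and x, 0x0000ffff) yields
// the mask 0x0c0c0100 (keep bytes 1:0, zero bytes 3:2), and (srl x, 16) yields
// 0x0c0c0302 (bytes 3:2 of x move down to bytes 1:0, upper bytes become zero).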
11706static uint32_t getPermuteMask(SDValue V) {
11707 assert(V.getValueSizeInBits() == 32);
11708
11709 if (V.getNumOperands() != 2)
11710 return ~0;
11711
11712 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11713 if (!N1)
11714 return ~0;
11715
11716 uint32_t C = N1->getZExtValue();
11717
11718 switch (V.getOpcode()) {
11719 default:
11720 break;
11721 case ISD::AND:
11722 if (uint32_t ConstMask = getConstantPermuteMask(C))
11723 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11724 break;
11725
11726 case ISD::OR:
11727 if (uint32_t ConstMask = getConstantPermuteMask(C))
11728 return (0x03020100 & ~ConstMask) | ConstMask;
11729 break;
11730
11731 case ISD::SHL:
11732 if (C % 8)
11733 return ~0;
11734
11735 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11736
11737 case ISD::SRL:
11738 if (C % 8)
11739 return ~0;
11740
11741 return uint32_t(0x0c0c0c0c03020100ull >> C);
11742 }
11743
11744 return ~0;
11745}
11746
11747SDValue SITargetLowering::performAndCombine(SDNode *N,
11748 DAGCombinerInfo &DCI) const {
11749 if (DCI.isBeforeLegalize())
11750 return SDValue();
11751
11752 SelectionDAG &DAG = DCI.DAG;
11753 EVT VT = N->getValueType(0);
11754 SDValue LHS = N->getOperand(0);
11755 SDValue RHS = N->getOperand(1);
11756
11757 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11758 if (VT == MVT::i64 && CRHS) {
11759 if (SDValue Split =
11760 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11761 return Split;
11762 }
11763
11764 if (CRHS && VT == MVT::i32) {
11765 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11766 // nb = number of trailing zeroes in mask
11767 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11768 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
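// Illustrative example (hypothetical constants): (and (srl x, 8), 0xff00) has
// nb = 8, so it becomes (shl (bfe x, 16, 8), 8): an 8-bit field starting at
// bit 8 + 8 = 16, shifted back up by nb.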
11769 uint64_t Mask = CRHS->getZExtValue();
11770 unsigned Bits = llvm::popcount(Mask);
11771 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11772 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11773 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11774 unsigned Shift = CShift->getZExtValue();
11775 unsigned NB = CRHS->getAPIntValue().countr_zero();
11776 unsigned Offset = NB + Shift;
11777 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11778 SDLoc SL(N);
11779 SDValue BFE =
11780 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11781 DAG.getConstant(Offset, SL, MVT::i32),
11782 DAG.getConstant(Bits, SL, MVT::i32));
11783 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11784 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11785 DAG.getValueType(NarrowVT));
11786 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11787 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11788 return Shl;
11789 }
11790 }
11791 }
11792
11793 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
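// Illustrative example (hypothetical constants): for c1 = 0x07060100 and
// c2 = 0xffff0000 the new selector is (c1 & 0xffff0000) | 0x00000c0c =
// 0x07060c0c, i.e. the two low result bytes are forced to zero (0x0c).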
11794 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11795 isa<ConstantSDNode>(LHS.getOperand(2))) {
11796 uint32_t Sel = getConstantPermuteMask(Mask);
11797 if (!Sel)
11798 return SDValue();
11799
11800 // Select 0xc for all zero bytes
11801 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11802 SDLoc DL(N);
11803 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11804 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11805 }
11806 }
11807
11808 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11809 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11810 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11811 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11812 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11813
11814 SDValue X = LHS.getOperand(0);
11815 SDValue Y = RHS.getOperand(0);
11816 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11817 !isTypeLegal(X.getValueType()))
11818 return SDValue();
11819
11820 if (LCC == ISD::SETO) {
11821 if (X != LHS.getOperand(1))
11822 return SDValue();
11823
11824 if (RCC == ISD::SETUNE) {
11825 const ConstantFPSDNode *C1 =
11826 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11827 if (!C1 || !C1->isInfinity() || C1->isNegative())
11828 return SDValue();
11829
11830 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11831 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11832 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11833 SIInstrFlags::P_NORMAL;
11834
11835 static_assert(
11836 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN | SIInstrFlags::N_INFINITY |
11837 SIInstrFlags::P_INFINITY)) &
11838 0x3ff) == Mask,
11839 "mask not equal");
11840
11841 SDLoc DL(N);
11842 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11843 DAG.getConstant(Mask, DL, MVT::i32));
11844 }
11845 }
11846 }
11847
11848 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11849 std::swap(LHS, RHS);
11850
11851 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11852 RHS.hasOneUse()) {
11853 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11854 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
11855 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
11856 // | n_nan)
11857 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11858 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11859 (RHS.getOperand(0) == LHS.getOperand(0) &&
11860 LHS.getOperand(0) == LHS.getOperand(1))) {
11861 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11862 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11863 : Mask->getZExtValue() & OrdMask;
11864
11865 SDLoc DL(N);
11866 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11867 DAG.getConstant(NewMask, DL, MVT::i32));
11868 }
11869 }
11870
11871 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11872 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11873 // and x, (sext cc from i1) => select cc, x, 0
11874 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11875 std::swap(LHS, RHS);
11876 if (isBoolSGPR(RHS.getOperand(0)))
11877 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11878 DAG.getConstant(0, SDLoc(N), MVT::i32));
11879 }
11880
11881 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11882 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11883 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11884 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11885 uint32_t LHSMask = getPermuteMask(LHS);
11886 uint32_t RHSMask = getPermuteMask(RHS);
11887 if (LHSMask != ~0u && RHSMask != ~0u) {
11888 // Canonicalize the expression in an attempt to have fewer unique masks
11889 // and therefore fewer registers used to hold the masks.
11890 if (LHSMask > RHSMask) {
11891 std::swap(LHSMask, RHSMask);
11892 std::swap(LHS, RHS);
11893 }
11894
11895 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11896 // set, 0xff has 0xff in the mask, actual lanes are in the 0-3 range.
11897 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11898 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11899
11900 // Check if we need to combine values from two sources within a byte.
11901 if (!(LHSUsedLanes & RHSUsedLanes) &&
11902 // If we select high and lower word keep it for SDWA.
11903 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11904 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11905 // Each byte in each mask is either a selector mask 0-3, or has higher
11906 // bits set in either of the masks: 0xff for 0xff and 0x0c for zero. If
11907 // 0x0c appears in either mask, that byte must always produce zero.
11908 // Otherwise the mask which is not 0xff wins. ANDing both masks gives the
11909 // correct result except that such bytes must be forced back to exactly 0x0c.
11910 uint32_t Mask = LHSMask & RHSMask;
11911 for (unsigned I = 0; I < 32; I += 8) {
11912 uint32_t ByteSel = 0xff << I;
11913 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11914 Mask &= (0x0c << I) & 0xffffffff;
11915 }
11916
11917 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11918 // or 0x0c.
11919 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11920 SDLoc DL(N);
11921
11922 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11923 RHS.getOperand(0),
11924 DAG.getConstant(Sel, DL, MVT::i32));
11925 }
11926 }
11927 }
11928
11929 return SDValue();
11930}
11931
11932// A key component of v_perm is a mapping between byte position of the src
11933 // operands, and the byte position of the dest. To provide such, we need: 1. the
11934 // node that provides byte x of the dest of the OR, and 2. the byte of that node
11935 // used to provide byte x. calculateByteProvider finds which node provides
11936 // a certain byte of the dest of the OR, and calculateSrcByte takes that node
11937 // and finds the ultimate src and byte position. For example: the supported
11938// LoadCombine pattern for vector loads is as follows
11939// t1
11940// or
11941// / \
11942// t2 t3
11943// zext shl
11944// | | \
11945// t4 t5 16
11946// or anyext
11947// / \ |
11948// t6 t7 t8
11949// srl shl or
11950// / | / \ / \
11951// t9 t10 t11 t12 t13 t14
11952// trunc* 8 trunc* 8 and and
11953// | | / | | \
11954// t15 t16 t17 t18 t19 t20
11955// trunc* 255 srl -256
11956// | / \
11957// t15 t15 16
11958//
11959// *In this example, the truncs are from i32->i16
11960//
11961// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11962// respectively. calculateSrcByte would find (given node) -> ultimate src &
11963// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11964// After finding the mapping, we can combine the tree into vperm t15, t16,
11965// 0x05000407
11966
11967// Find the source and byte position from a node.
11968// \p DestByte is the byte position of the dest of the or that the src
11969// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11970// dest of the or byte. \p Depth tracks how many recursive iterations we have
11971// performed.
11972static const std::optional<ByteProvider<SDValue>>
11973calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11974 unsigned Depth = 0) {
11975 // We may need to recursively traverse a series of SRLs
11976 if (Depth >= 6)
11977 return std::nullopt;
11978
11979 if (Op.getValueSizeInBits() < 8)
11980 return std::nullopt;
11981
11982 if (Op.getValueType().isVector())
11983 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11984
11985 switch (Op->getOpcode()) {
11986 case ISD::TRUNCATE: {
11987 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11988 }
11989
11990 case ISD::SIGN_EXTEND:
11991 case ISD::ZERO_EXTEND:
11992 case ISD::SIGN_EXTEND_INREG: {
11993 SDValue NarrowOp = Op->getOperand(0);
11994 auto NarrowVT = NarrowOp.getValueType();
11995 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11996 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11997 NarrowVT = VTSign->getVT();
11998 }
11999 if (!NarrowVT.isByteSized())
12000 return std::nullopt;
12001 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12002
12003 if (SrcIndex >= NarrowByteWidth)
12004 return std::nullopt;
12005 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12006 }
12007
12008 case ISD::SRA:
12009 case ISD::SRL: {
12010 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12011 if (!ShiftOp)
12012 return std::nullopt;
12013
12014 uint64_t BitShift = ShiftOp->getZExtValue();
12015
12016 if (BitShift % 8 != 0)
12017 return std::nullopt;
12018
12019 SrcIndex += BitShift / 8;
12020
12021 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12022 }
12023
12024 default: {
12025 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12026 }
12027 }
12028 llvm_unreachable("fully handled switch");
12029}
12030
12031// For a byte position in the result of an Or, traverse the tree and find the
12032// node (and the byte of the node) which ultimately provides this {Or,
12033// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12034// the byte position of the Op that corresponds with the originally requested
12035// byte of the Or \p Depth tracks how many recursive iterations we have
12036// performed. \p StartingIndex is the originally requested byte of the Or
12037static const std::optional<ByteProvider<SDValue>>
12038calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12039 unsigned StartingIndex = 0) {
12040 // Finding Src tree of RHS of or typically requires at least 1 additional
12041 // depth
12042 if (Depth > 6)
12043 return std::nullopt;
12044
12045 unsigned BitWidth = Op.getScalarValueSizeInBits();
12046 if (BitWidth % 8 != 0)
12047 return std::nullopt;
12048 if (Index > BitWidth / 8 - 1)
12049 return std::nullopt;
12050
12051 bool IsVec = Op.getValueType().isVector();
12052 switch (Op.getOpcode()) {
12053 case ISD::OR: {
12054 if (IsVec)
12055 return std::nullopt;
12056
12057 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12058 StartingIndex);
12059 if (!RHS)
12060 return std::nullopt;
12061 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12062 StartingIndex);
12063 if (!LHS)
12064 return std::nullopt;
12065 // A well formed Or will have two ByteProviders for each byte, one of which
12066 // is constant zero
12067 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12068 return std::nullopt;
12069 if (!LHS || LHS->isConstantZero())
12070 return RHS;
12071 if (!RHS || RHS->isConstantZero())
12072 return LHS;
12073 return std::nullopt;
12074 }
12075
12076 case ISD::AND: {
12077 if (IsVec)
12078 return std::nullopt;
12079
12080 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12081 if (!BitMaskOp)
12082 return std::nullopt;
12083
12084 uint32_t BitMask = BitMaskOp->getZExtValue();
12085 // Bits we expect for our StartingIndex
12086 uint32_t IndexMask = 0xFF << (Index * 8);
12087
12088 if ((IndexMask & BitMask) != IndexMask) {
12089 // If the result of the and partially provides the byte, then it
12090 // is not well formed
12091 if (IndexMask & BitMask)
12092 return std::nullopt;
12093 return ByteProvider<SDValue>::getConstantZero();
12094 }
12095
12096 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12097 }
12098
12099 case ISD::FSHR: {
12100 if (IsVec)
12101 return std::nullopt;
12102
12103 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
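// Illustrative example (hypothetical shift): for i32 fshr(x, y, 8), result
// byte 3 comes from byte 0 of x and result bytes 2:0 come from bytes 3:1 of
// y, so the provider search continues into x or y accordingly.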
12104 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12105 if (!ShiftOp || Op.getValueType().isVector())
12106 return std::nullopt;
12107
12108 uint64_t BitsProvided = Op.getValueSizeInBits();
12109 if (BitsProvided % 8 != 0)
12110 return std::nullopt;
12111
12112 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12113 if (BitShift % 8)
12114 return std::nullopt;
12115
12116 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12117 uint64_t ByteShift = BitShift / 8;
12118
12119 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12120 uint64_t BytesProvided = BitsProvided / 8;
12121 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12122 NewIndex %= BytesProvided;
12123 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12124 }
12125
12126 case ISD::SRA:
12127 case ISD::SRL: {
12128 if (IsVec)
12129 return std::nullopt;
12130
12131 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12132 if (!ShiftOp)
12133 return std::nullopt;
12134
12135 uint64_t BitShift = ShiftOp->getZExtValue();
12136 if (BitShift % 8)
12137 return std::nullopt;
12138
12139 auto BitsProvided = Op.getScalarValueSizeInBits();
12140 if (BitsProvided % 8 != 0)
12141 return std::nullopt;
12142
12143 uint64_t BytesProvided = BitsProvided / 8;
12144 uint64_t ByteShift = BitShift / 8;
12145 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12146 // If the byte we are trying to provide (as tracked by index) falls in this
12147 // range, then the SRL provides the byte. The byte of interest of the src of
12148 // the SRL is Index + ByteShift
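// Illustrative example (hypothetical shift): for (srl i32:x, 16), result
// bytes 0 and 1 are provided by bytes 2 and 3 of x, while result bytes 2 and
// 3 are constant zero.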
12149 return BytesProvided - ByteShift > Index
12150 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12151 Index + ByteShift)
12152 : ByteProvider<SDValue>::getConstantZero();
12153 }
12154
12155 case ISD::SHL: {
12156 if (IsVec)
12157 return std::nullopt;
12158
12159 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12160 if (!ShiftOp)
12161 return std::nullopt;
12162
12163 uint64_t BitShift = ShiftOp->getZExtValue();
12164 if (BitShift % 8 != 0)
12165 return std::nullopt;
12166 uint64_t ByteShift = BitShift / 8;
12167
12168 // If we are shifting by an amount greater than (or equal to)
12169 // the index we are trying to provide, then it provides 0s. If not,
12170 // then these bytes are not definitively 0s, and the corresponding byte
12171 // of interest is Index - ByteShift of the src
12172 return Index < ByteShift
12173 ? ByteProvider<SDValue>::getConstantZero()
12174 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12175 Depth + 1, StartingIndex);
12176 }
12177 case ISD::ANY_EXTEND:
12178 case ISD::SIGN_EXTEND:
12179 case ISD::ZERO_EXTEND:
12180 case ISD::SIGN_EXTEND_INREG:
12181 case ISD::AssertZext:
12182 case ISD::AssertSext: {
12183 if (IsVec)
12184 return std::nullopt;
12185
12186 SDValue NarrowOp = Op->getOperand(0);
12187 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12188 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12189 Op->getOpcode() == ISD::AssertZext ||
12190 Op->getOpcode() == ISD::AssertSext) {
12191 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12192 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12193 }
12194 if (NarrowBitWidth % 8 != 0)
12195 return std::nullopt;
12196 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12197
12198 if (Index >= NarrowByteWidth)
12199 return Op.getOpcode() == ISD::ZERO_EXTEND
12200 ? std::optional<ByteProvider<SDValue>>(
12201 ByteProvider<SDValue>::getConstantZero())
12202 : std::nullopt;
12203 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12204 }
12205
12206 case ISD::TRUNCATE: {
12207 if (IsVec)
12208 return std::nullopt;
12209
12210 uint64_t NarrowByteWidth = BitWidth / 8;
12211
12212 if (NarrowByteWidth >= Index) {
12213 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12214 StartingIndex);
12215 }
12216
12217 return std::nullopt;
12218 }
12219
12220 case ISD::CopyFromReg: {
12221 if (BitWidth / 8 > Index)
12222 return calculateSrcByte(Op, StartingIndex, Index);
12223
12224 return std::nullopt;
12225 }
12226
12227 case ISD::LOAD: {
12228 auto *L = cast<LoadSDNode>(Op.getNode());
12229
12230 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12231 if (NarrowBitWidth % 8 != 0)
12232 return std::nullopt;
12233 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12234
12235 // If the width of the load does not reach the byte we are trying to provide
12236 // for, and it is not a ZEXTLOAD, then the load does not provide for the byte
12237 // in question.
12238 if (Index >= NarrowByteWidth) {
12239 return L->getExtensionType() == ISD::ZEXTLOAD
12240 ? std::optional<ByteProvider<SDValue>>(
12241 ByteProvider<SDValue>::getConstantZero())
12242 : std::nullopt;
12243 }
12244
12245 if (NarrowByteWidth > Index) {
12246 return calculateSrcByte(Op, StartingIndex, Index);
12247 }
12248
12249 return std::nullopt;
12250 }
12251
12252 case ISD::BSWAP: {
12253 if (IsVec)
12254 return std::nullopt;
12255
12256 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12257 Depth + 1, StartingIndex);
12258 }
12259
12260 case ISD::EXTRACT_VECTOR_ELT: {
12261 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12262 if (!IdxOp)
12263 return std::nullopt;
12264 auto VecIdx = IdxOp->getZExtValue();
12265 auto ScalarSize = Op.getScalarValueSizeInBits();
12266 if (ScalarSize < 32)
12267 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12268 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12269 StartingIndex, Index);
12270 }
12271
12272 case AMDGPUISD::PERM: {
12273 if (IsVec)
12274 return std::nullopt;
12275
12276 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12277 if (!PermMask)
12278 return std::nullopt;
12279
12280 auto IdxMask =
12281 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12282 if (IdxMask > 0x07 && IdxMask != 0x0c)
12283 return std::nullopt;
12284
12285 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12286 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12287
12288 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12289 : ByteProvider<SDValue>(
12290 ByteProvider<SDValue>::getConstantZero());
12291 }
12292
12293 default: {
12294 return std::nullopt;
12295 }
12296 }
12297
12298 llvm_unreachable("fully handled switch");
12299}
12300
12301 // Returns true if the Operand is a scalar extended or loaded from a 16-bit value.
12302static bool isExtendedFrom16Bits(SDValue &Operand) {
12303
12304 switch (Operand.getOpcode()) {
12305 case ISD::ANY_EXTEND:
12306 case ISD::SIGN_EXTEND:
12307 case ISD::ZERO_EXTEND: {
12308 auto OpVT = Operand.getOperand(0).getValueType();
12309 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12310 }
12311 case ISD::LOAD: {
12312 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12313 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12314 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12315 ExtType == ISD::EXTLOAD) {
12316 auto MemVT = L->getMemoryVT();
12317 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12318 }
12319 return L->getMemoryVT().getSizeInBits() == 16;
12320 }
12321 default:
12322 return false;
12323 }
12324}
12325
12326 // Returns true if the mask selects consecutive bytes, and the first byte
12327 // begins at an even (16-bit aligned) offset from byte 0.
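// Illustrative example (hypothetical masks): 0x0302 selects bytes 2 and 3
// starting at byte 2 and is 16-bit addressable, while 0x0201 selects bytes 1
// and 2 starting at byte 1 and is not.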
12328static bool addresses16Bits(int Mask) {
12329 int Low8 = Mask & 0xff;
12330 int Hi8 = (Mask & 0xff00) >> 8;
12331
12332 assert(Low8 < 8 && Hi8 < 8);
12333 // Are the bytes contiguous in the order of increasing addresses.
12334 bool IsConsecutive = (Hi8 - Low8 == 1);
12335 // Is the first byte at location that is aligned for 16 bit instructions.
12336 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12337 // In this case, we still need code to extract the 16 bit operand, so it
12338 // is better to use i8 v_perm
12339 bool Is16Aligned = !(Low8 % 2);
12340
12341 return IsConsecutive && Is16Aligned;
12342}
12343
12344// Do not lower into v_perm if the operands are actually 16 bit
12345// and the selected bits (based on PermMask) correspond with two
12346// easily addressable 16 bit operands.
12347static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12348 SDValue &OtherOp) {
12349 int Low16 = PermMask & 0xffff;
12350 int Hi16 = (PermMask & 0xffff0000) >> 16;
12351
12352 auto TempOp = peekThroughBitcasts(Op);
12353 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12354
12355 auto OpIs16Bit =
12356 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12357 if (!OpIs16Bit)
12358 return true;
12359
12360 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12361 isExtendedFrom16Bits(TempOtherOp);
12362 if (!OtherOpIs16Bit)
12363 return true;
12364
12365 // Do we cleanly address both
12366 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12367}
12368
12369static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12370 unsigned DWordOffset) {
12371 SDValue Ret;
12372
12373 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12374 // ByteProvider must be at least 8 bits
12375 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12376
12377 if (TypeSize <= 32)
12378 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12379
12380 if (Src.getValueType().isVector()) {
12381 auto ScalarTySize = Src.getScalarValueSizeInBits();
12382 auto ScalarTy = Src.getValueType().getScalarType();
12383 if (ScalarTySize == 32) {
12384 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12385 DAG.getConstant(DWordOffset, SL, MVT::i32));
12386 }
12387 if (ScalarTySize > 32) {
12388 Ret = DAG.getNode(
12389 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12390 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12391 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12392 if (ShiftVal)
12393 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12394 DAG.getConstant(ShiftVal, SL, MVT::i32));
12395 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12396 }
12397
12398 assert(ScalarTySize < 32);
12399 auto NumElements = TypeSize / ScalarTySize;
12400 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12401 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12402 auto NumElementsIn32 = 32 / ScalarTySize;
12403 auto NumAvailElements = DWordOffset < Trunc32Elements
12404 ? NumElementsIn32
12405 : NumElements - NormalizedTrunc;
12406
12407 SmallVector<SDValue, 4> VecSrcs;
12408 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12409 NumAvailElements);
12410
12411 Ret = DAG.getBuildVector(
12412 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12413 VecSrcs);
12414 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12415 }
12416
12417 /// Scalar Type
12418 auto ShiftVal = 32 * DWordOffset;
12419 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12420 DAG.getConstant(ShiftVal, SL, MVT::i32));
12421 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12422}
12423
12424static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12425 SelectionDAG &DAG = DCI.DAG;
12426 [[maybe_unused]] EVT VT = N->getValueType(0);
12427 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12428
12429 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12430 assert(VT == MVT::i32);
12431 for (int i = 0; i < 4; i++) {
12432 // Find the ByteProvider that provides the ith byte of the result of OR
12433 std::optional<ByteProvider<SDValue>> P =
12434 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12435 // TODO support constantZero
12436 if (!P || P->isConstantZero())
12437 return SDValue();
12438
12439 PermNodes.push_back(*P);
12440 }
12441 if (PermNodes.size() != 4)
12442 return SDValue();
12443
12444 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12445 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12446 uint64_t PermMask = 0x00000000;
12447 for (size_t i = 0; i < PermNodes.size(); i++) {
12448 auto PermOp = PermNodes[i];
12449 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12450 // by sizeof(Src2) = 4
12451 int SrcByteAdjust = 4;
12452
12453 // If the Src uses a byte from a different DWORD, then it corresponds
12454 // with a different source.
12455 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12456 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12457 if (SecondSrc)
12458 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12459 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12460 return SDValue();
12461
12462 // Set the index of the second distinct Src node
12463 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12464 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12465 SrcByteAdjust = 0;
12466 }
12467 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12468 assert(!DAG.getDataLayout().isBigEndian());
12469 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12470 }
12471 SDLoc DL(N);
12472 SDValue Op = *PermNodes[FirstSrc.first].Src;
12473 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12474 assert(Op.getValueSizeInBits() == 32);
12475
12476 // Check that we are not just extracting the bytes in order from an op
12477 if (!SecondSrc) {
12478 int Low16 = PermMask & 0xffff;
12479 int Hi16 = (PermMask & 0xffff0000) >> 16;
12480
12481 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12482 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12483
12484 // The perm op would really just produce Op. So combine into Op
12485 if (WellFormedLow && WellFormedHi)
12486 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12487 }
12488
12489 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12490
12491 if (SecondSrc) {
12492 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12493 assert(OtherOp.getValueSizeInBits() == 32);
12494 }
12495
12496 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12497
12498 assert(Op.getValueType().isByteSized() &&
12499 OtherOp.getValueType().isByteSized());
12500
12501 // If the ultimate src is less than 32 bits, then we will only be
12502 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12503 // CalculateByteProvider would not have returned Op as source if we
12504 // used a byte that is outside its ValueType. Thus, we are free to
12505 // ANY_EXTEND as the extended bits are dont-cares.
12506 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12507 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12508
12509 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12510 DAG.getConstant(PermMask, DL, MVT::i32));
12511 }
12512 return SDValue();
12513}
12514
12515SDValue SITargetLowering::performOrCombine(SDNode *N,
12516 DAGCombinerInfo &DCI) const {
12517 SelectionDAG &DAG = DCI.DAG;
12518 SDValue LHS = N->getOperand(0);
12519 SDValue RHS = N->getOperand(1);
12520
12521 EVT VT = N->getValueType(0);
12522 if (VT == MVT::i1) {
12523 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12524 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12525 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12526 SDValue Src = LHS.getOperand(0);
12527 if (Src != RHS.getOperand(0))
12528 return SDValue();
12529
12530 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12531 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12532 if (!CLHS || !CRHS)
12533 return SDValue();
12534
12535 // Only 10 bits are used.
12536 static const uint32_t MaxMask = 0x3ff;
12537
12538 uint32_t NewMask =
12539 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12540 SDLoc DL(N);
12541 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12542 DAG.getConstant(NewMask, DL, MVT::i32));
12543 }
12544
12545 return SDValue();
12546 }
12547
12548 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
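// Illustrative example (hypothetical constants): for c1 = 0x07060100 and
// c2 = 0x0000ff00 the selectors become 0x0706ff00, i.e. result byte 1 now
// produces the constant 0xff instead of a source byte.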
12549 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12550 LHS.getOpcode() == AMDGPUISD::PERM &&
12551 isa<ConstantSDNode>(LHS.getOperand(2))) {
12552 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12553 if (!Sel)
12554 return SDValue();
12555
12556 Sel |= LHS.getConstantOperandVal(2);
12557 SDLoc DL(N);
12558 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12559 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12560 }
12561
12562 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12563 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12564 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12565 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12566
12567 // If all the uses of an or need to extract the individual elements, do not
12568 // attempt to lower into v_perm
12569 auto usesCombinedOperand = [](SDNode *OrUse) {
12570 // If we have any non-vectorized use, then it is a candidate for v_perm
12571 if (OrUse->getOpcode() != ISD::BITCAST ||
12572 !OrUse->getValueType(0).isVector())
12573 return true;
12574
12575 // If we have any non-vectorized use, then it is a candidate for v_perm
12576 for (auto *VUser : OrUse->users()) {
12577 if (!VUser->getValueType(0).isVector())
12578 return true;
12579
12580 // If the use of a vector is a store, then combining via a v_perm
12581 // is beneficial.
12582 // TODO -- whitelist more uses
12583 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12584 if (VUser->getOpcode() == VectorwiseOp)
12585 return true;
12586 }
12587 return false;
12588 };
12589
12590 if (!any_of(N->users(), usesCombinedOperand))
12591 return SDValue();
12592
12593 uint32_t LHSMask = getPermuteMask(LHS);
12594 uint32_t RHSMask = getPermuteMask(RHS);
12595
12596 if (LHSMask != ~0u && RHSMask != ~0u) {
12597 // Canonicalize the expression in an attempt to have fewer unique masks
12598 // and therefore fewer registers used to hold the masks.
12599 if (LHSMask > RHSMask) {
12600 std::swap(LHSMask, RHSMask);
12601 std::swap(LHS, RHS);
12602 }
12603
12604 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12605 // set, 0xff has 0xff in the mask, actual lanes are in the 0-3 range.
12606 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12607 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12608
12609 // Check if we need to combine values from two sources within a byte.
12610 if (!(LHSUsedLanes & RHSUsedLanes) &&
12611 // If we select high and lower word keep it for SDWA.
12612 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12613 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12614 // Kill zero bytes selected by other mask. Zero value is 0xc.
12615 LHSMask &= ~RHSUsedLanes;
12616 RHSMask &= ~LHSUsedLanes;
12617 // Add 4 to each active LHS lane
12618 LHSMask |= LHSUsedLanes & 0x04040404;
12619 // Combine masks
12620 uint32_t Sel = LHSMask | RHSMask;
12621 SDLoc DL(N);
12622
12623 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12624 RHS.getOperand(0),
12625 DAG.getConstant(Sel, DL, MVT::i32));
12626 }
12627 }
12628 if (LHSMask == ~0u || RHSMask == ~0u) {
12629 if (SDValue Perm = matchPERM(N, DCI))
12630 return Perm;
12631 }
12632 }
12633
12634 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12635 return SDValue();
12636
12637 // TODO: This could be a generic combine with a predicate for extracting the
12638 // high half of an integer being free.
12639
12640 // (or i64:x, (zero_extend i32:y)) ->
12641 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12642 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12643 RHS.getOpcode() != ISD::ZERO_EXTEND)
12644 std::swap(LHS, RHS);
12645
12646 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12647 SDValue ExtSrc = RHS.getOperand(0);
12648 EVT SrcVT = ExtSrc.getValueType();
12649 if (SrcVT == MVT::i32) {
12650 SDLoc SL(N);
12651 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12652 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12653
12654 DCI.AddToWorklist(LowOr.getNode());
12655 DCI.AddToWorklist(HiBits.getNode());
12656
12657 SDValue Vec =
12658 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12659 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12660 }
12661 }
12662
12663 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12664 if (CRHS) {
12665 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12666 N->getOperand(0), CRHS))
12667 return Split;
12668 }
12669
12670 return SDValue();
12671}
12672
12673SDValue SITargetLowering::performXorCombine(SDNode *N,
12674 DAGCombinerInfo &DCI) const {
12675 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12676 return RV;
12677
12678 SDValue LHS = N->getOperand(0);
12679 SDValue RHS = N->getOperand(1);
12680
12681 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12682 SelectionDAG &DAG = DCI.DAG;
12683
12684 EVT VT = N->getValueType(0);
12685 if (CRHS && VT == MVT::i64) {
12686 if (SDValue Split =
12687 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12688 return Split;
12689 }
12690
12691 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12692 // fneg-like xors into 64-bit select.
12693 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12694 // This looks like an fneg, try to fold as a source modifier.
12695 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12696 shouldFoldFNegIntoSrc(N, LHS)) {
12697 // xor (select c, a, b), 0x80000000 ->
12698 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12699 SDLoc DL(N);
12700 SDValue CastLHS =
12701 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12702 SDValue CastRHS =
12703 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12704 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12705 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12706 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12707 LHS->getOperand(0), FNegLHS, FNegRHS);
12708 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12709 }
12710 }
12711
12712 return SDValue();
12713}
12714
12715SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12716 DAGCombinerInfo &DCI) const {
12717 if (!Subtarget->has16BitInsts() ||
12718 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12719 return SDValue();
12720
12721 EVT VT = N->getValueType(0);
12722 if (VT != MVT::i32)
12723 return SDValue();
12724
12725 SDValue Src = N->getOperand(0);
12726 if (Src.getValueType() != MVT::i16)
12727 return SDValue();
12728
12729 return SDValue();
12730}
12731
12732SDValue
12733SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12734 DAGCombinerInfo &DCI) const {
12735 SDValue Src = N->getOperand(0);
12736 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12737
12738 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12739 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12740 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12741 VTSign->getVT() == MVT::i8) ||
12742 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12743 VTSign->getVT() == MVT::i16))) {
12744 assert(Subtarget->hasScalarSubwordLoads() &&
12745 "s_buffer_load_{u8, i8} are supported "
12746 "in GFX12 (or newer) architectures.");
12747 EVT VT = Src.getValueType();
12748 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12749 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12750 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12751 SDLoc DL(N);
12752 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12753 SDValue Ops[] = {
12754 Src.getOperand(0), // source register
12755 Src.getOperand(1), // offset
12756 Src.getOperand(2) // cachePolicy
12757 };
12758 auto *M = cast<MemSDNode>(Src);
12759 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12760 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12761 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12762 return LoadVal;
12763 }
12764 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12765 VTSign->getVT() == MVT::i8) ||
12766 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12767 VTSign->getVT() == MVT::i16)) &&
12768 Src.hasOneUse()) {
12769 auto *M = cast<MemSDNode>(Src);
12770 SDValue Ops[] = {Src.getOperand(0), // Chain
12771 Src.getOperand(1), // rsrc
12772 Src.getOperand(2), // vindex
12773 Src.getOperand(3), // voffset
12774 Src.getOperand(4), // soffset
12775 Src.getOperand(5), // offset
12776 Src.getOperand(6), Src.getOperand(7)};
12777 // replace with BUFFER_LOAD_BYTE/SHORT
12778 SDVTList ResList =
12779 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12780 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12781 ? AMDGPUISD::BUFFER_LOAD_BYTE
12782 : AMDGPUISD::BUFFER_LOAD_SHORT;
12783 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12784 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12785 return DCI.DAG.getMergeValues(
12786 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12787 }
12788 return SDValue();
12789}
12790
12791SDValue SITargetLowering::performClassCombine(SDNode *N,
12792 DAGCombinerInfo &DCI) const {
12793 SelectionDAG &DAG = DCI.DAG;
12794 SDValue Mask = N->getOperand(1);
12795
12796 // fp_class x, 0 -> false
12797 if (isNullConstant(Mask))
12798 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12799
12800 if (N->getOperand(0).isUndef())
12801 return DAG.getUNDEF(MVT::i1);
12802
12803 return SDValue();
12804}
12805
12806SDValue SITargetLowering::performRcpCombine(SDNode *N,
12807 DAGCombinerInfo &DCI) const {
12808 EVT VT = N->getValueType(0);
12809 SDValue N0 = N->getOperand(0);
12810
12811 if (N0.isUndef()) {
12812 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12813 SDLoc(N), VT);
12814 }
12815
12816 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12817 N0.getOpcode() == ISD::SINT_TO_FP)) {
12818 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12819 N->getFlags());
12820 }
12821
12822 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12823 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12824 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12825 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12826 N->getFlags());
12827 }
12828
12829 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12830}
12831
12832bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12833 unsigned MaxDepth) const {
12834 unsigned Opcode = Op.getOpcode();
12835 if (Opcode == ISD::FCANONICALIZE)
12836 return true;
12837
12838 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12839 const auto &F = CFP->getValueAPF();
12840 if (F.isNaN() && F.isSignaling())
12841 return false;
12842 if (!F.isDenormal())
12843 return true;
12844
12845 DenormalMode Mode =
12846 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12847 return Mode == DenormalMode::getIEEE();
12848 }
12849
12850 // If source is a result of another standard FP operation it is already in
12851 // canonical form.
12852 if (MaxDepth == 0)
12853 return false;
12854
12855 switch (Opcode) {
12856 // These will flush denorms if required.
12857 case ISD::FADD:
12858 case ISD::FSUB:
12859 case ISD::FMUL:
12860 case ISD::FCEIL:
12861 case ISD::FFLOOR:
12862 case ISD::FMA:
12863 case ISD::FMAD:
12864 case ISD::FSQRT:
12865 case ISD::FDIV:
12866 case ISD::FREM:
12867 case ISD::FP_ROUND:
12868 case ISD::FP_EXTEND:
12869 case ISD::FP16_TO_FP:
12870 case ISD::FP_TO_FP16:
12871 case ISD::BF16_TO_FP:
12872 case ISD::FP_TO_BF16:
12873 case ISD::FLDEXP:
12876 case AMDGPUISD::RCP:
12877 case AMDGPUISD::RSQ:
12881 case AMDGPUISD::LOG:
12882 case AMDGPUISD::EXP:
12886 case AMDGPUISD::FRACT:
12893 case AMDGPUISD::SIN_HW:
12894 case AMDGPUISD::COS_HW:
12895 return true;
12896
12897 // It can/will be lowered or combined as a bit operation.
12898 // Need to check their input recursively to handle.
12899 case ISD::FNEG:
12900 case ISD::FABS:
12901 case ISD::FCOPYSIGN:
12902 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12903
12904 case ISD::AND:
12905 if (Op.getValueType() == MVT::i32) {
12906 // Be careful as we only know it is a bitcast floating point type. It
12907 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12908 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12909 // is valid to optimize for all types.
12910 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12911 if (RHS->getZExtValue() == 0xffff0000) {
12912 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12913 }
12914 }
12915 }
12916 break;
12917
12918 case ISD::FSIN:
12919 case ISD::FCOS:
12920 case ISD::FSINCOS:
12921 return Op.getValueType().getScalarType() != MVT::f16;
12922
12923 case ISD::FMINNUM:
12924 case ISD::FMAXNUM:
12925 case ISD::FMINNUM_IEEE:
12926 case ISD::FMAXNUM_IEEE:
12927 case ISD::FMINIMUM:
12928 case ISD::FMAXIMUM:
12929 case AMDGPUISD::CLAMP:
12930 case AMDGPUISD::FMED3:
12931 case AMDGPUISD::FMAX3:
12932 case AMDGPUISD::FMIN3:
12933 case AMDGPUISD::FMAXIMUM3:
12934 case AMDGPUISD::FMINIMUM3: {
12935 // FIXME: Shouldn't treat the generic operations differently based on these.
12936 // However, we aren't really required to flush the result from
12937 // minnum/maxnum.
12938
12939 // snans will be quieted, so we only need to worry about denormals.
12940 if (Subtarget->supportsMinMaxDenormModes() ||
12941 // FIXME: denormalsEnabledForType is broken for dynamic
12942 denormalsEnabledForType(DAG, Op.getValueType()))
12943 return true;
12944
12945 // Flushing may be required.
12946 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12947 // targets need to check their input recursively.
12948
12949 // FIXME: Does this apply with clamp? It's implemented with max.
12950 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12951 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12952 return false;
12953 }
12954
12955 return true;
12956 }
12957 case ISD::SELECT: {
12958 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12959 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12960 }
12961 case ISD::BUILD_VECTOR: {
12962 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12963 SDValue SrcOp = Op.getOperand(i);
12964 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12965 return false;
12966 }
12967
12968 return true;
12969 }
12970 case ISD::EXTRACT_VECTOR_ELT:
12971 case ISD::EXTRACT_SUBVECTOR: {
12972 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12973 }
12974 case ISD::INSERT_VECTOR_ELT: {
12975 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12976 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12977 }
12978 case ISD::UNDEF:
12979 // Could be anything.
12980 return false;
12981
12982 case ISD::BITCAST:
12983 // TODO: This is incorrect as it loses track of the operand's type. We may
12984 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12985 // same bits that are canonicalized in one type need not be in the other.
12986 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12987 case ISD::TRUNCATE: {
12988 // Hack around the mess we make when legalizing extract_vector_elt
12989 if (Op.getValueType() == MVT::i16) {
12990 SDValue TruncSrc = Op.getOperand(0);
12991 if (TruncSrc.getValueType() == MVT::i32 &&
12992 TruncSrc.getOpcode() == ISD::BITCAST &&
12993 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12994 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12995 }
12996 }
12997 return false;
12998 }
12999 case ISD::INTRINSIC_WO_CHAIN: {
13000 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13001 // TODO: Handle more intrinsics
13002 switch (IntrinsicID) {
13003 case Intrinsic::amdgcn_cvt_pkrtz:
13004 case Intrinsic::amdgcn_cubeid:
13005 case Intrinsic::amdgcn_frexp_mant:
13006 case Intrinsic::amdgcn_fdot2:
13007 case Intrinsic::amdgcn_rcp:
13008 case Intrinsic::amdgcn_rsq:
13009 case Intrinsic::amdgcn_rsq_clamp:
13010 case Intrinsic::amdgcn_rcp_legacy:
13011 case Intrinsic::amdgcn_rsq_legacy:
13012 case Intrinsic::amdgcn_trig_preop:
13013 case Intrinsic::amdgcn_log:
13014 case Intrinsic::amdgcn_exp2:
13015 case Intrinsic::amdgcn_sqrt:
13016 return true;
13017 default:
13018 break;
13019 }
13020
13021 break;
13022 }
13023 default:
13024 break;
13025 }
13026
13027 // FIXME: denormalsEnabledForType is broken for dynamic
13028 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13029 DAG.isKnownNeverSNaN(Op);
13030}
13031
13032bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13033 unsigned MaxDepth) const {
13034 const MachineRegisterInfo &MRI = MF.getRegInfo();
13035 MachineInstr *MI = MRI.getVRegDef(Reg);
13036 unsigned Opcode = MI->getOpcode();
13037
13038 if (Opcode == AMDGPU::G_FCANONICALIZE)
13039 return true;
13040
13041 std::optional<FPValueAndVReg> FCR;
13042 // Constant splat (can be padded with undef) or scalar constant.
13043 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13044 if (FCR->Value.isSignaling())
13045 return false;
13046 if (!FCR->Value.isDenormal())
13047 return true;
13048
13049 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13050 return Mode == DenormalMode::getIEEE();
13051 }
13052
13053 if (MaxDepth == 0)
13054 return false;
13055
13056 switch (Opcode) {
13057 case AMDGPU::G_FADD:
13058 case AMDGPU::G_FSUB:
13059 case AMDGPU::G_FMUL:
13060 case AMDGPU::G_FCEIL:
13061 case AMDGPU::G_FFLOOR:
13062 case AMDGPU::G_FRINT:
13063 case AMDGPU::G_FNEARBYINT:
13064 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13065 case AMDGPU::G_INTRINSIC_TRUNC:
13066 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13067 case AMDGPU::G_FMA:
13068 case AMDGPU::G_FMAD:
13069 case AMDGPU::G_FSQRT:
13070 case AMDGPU::G_FDIV:
13071 case AMDGPU::G_FREM:
13072 case AMDGPU::G_FPOW:
13073 case AMDGPU::G_FPEXT:
13074 case AMDGPU::G_FLOG:
13075 case AMDGPU::G_FLOG2:
13076 case AMDGPU::G_FLOG10:
13077 case AMDGPU::G_FPTRUNC:
13078 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13079 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13080 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13081 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13082 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13083 return true;
13084 case AMDGPU::G_FNEG:
13085 case AMDGPU::G_FABS:
13086 case AMDGPU::G_FCOPYSIGN:
13087 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13088 case AMDGPU::G_FMINNUM:
13089 case AMDGPU::G_FMAXNUM:
13090 case AMDGPU::G_FMINNUM_IEEE:
13091 case AMDGPU::G_FMAXNUM_IEEE:
13092 case AMDGPU::G_FMINIMUM:
13093 case AMDGPU::G_FMAXIMUM: {
13094 if (Subtarget->supportsMinMaxDenormModes() ||
13095 // FIXME: denormalsEnabledForType is broken for dynamic
13096 denormalsEnabledForType(MRI.getType(Reg), MF))
13097 return true;
13098
13099 [[fallthrough]];
13100 }
13101 case AMDGPU::G_BUILD_VECTOR:
13102 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13103 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13104 return false;
13105 return true;
13106 case AMDGPU::G_INTRINSIC:
13107 case AMDGPU::G_INTRINSIC_CONVERGENT:
13108 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13109 case Intrinsic::amdgcn_fmul_legacy:
13110 case Intrinsic::amdgcn_fmad_ftz:
13111 case Intrinsic::amdgcn_sqrt:
13112 case Intrinsic::amdgcn_fmed3:
13113 case Intrinsic::amdgcn_sin:
13114 case Intrinsic::amdgcn_cos:
13115 case Intrinsic::amdgcn_log:
13116 case Intrinsic::amdgcn_exp2:
13117 case Intrinsic::amdgcn_log_clamp:
13118 case Intrinsic::amdgcn_rcp:
13119 case Intrinsic::amdgcn_rcp_legacy:
13120 case Intrinsic::amdgcn_rsq:
13121 case Intrinsic::amdgcn_rsq_clamp:
13122 case Intrinsic::amdgcn_rsq_legacy:
13123 case Intrinsic::amdgcn_div_scale:
13124 case Intrinsic::amdgcn_div_fmas:
13125 case Intrinsic::amdgcn_div_fixup:
13126 case Intrinsic::amdgcn_fract:
13127 case Intrinsic::amdgcn_cvt_pkrtz:
13128 case Intrinsic::amdgcn_cubeid:
13129 case Intrinsic::amdgcn_cubema:
13130 case Intrinsic::amdgcn_cubesc:
13131 case Intrinsic::amdgcn_cubetc:
13132 case Intrinsic::amdgcn_frexp_mant:
13133 case Intrinsic::amdgcn_fdot2:
13134 case Intrinsic::amdgcn_trig_preop:
13135 return true;
13136 default:
13137 break;
13138 }
13139
13140 [[fallthrough]];
13141 default:
13142 return false;
13143 }
13144
13145 llvm_unreachable("invalid operation");
13146}
13147
13148// Constant fold canonicalize.
13149SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13150 const SDLoc &SL, EVT VT,
13151 const APFloat &C) const {
13152 // Flush denormals to 0 if not enabled.
13153 if (C.isDenormal()) {
13154 DenormalMode Mode =
13155 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13156 if (Mode == DenormalMode::getPreserveSign()) {
13157 return DAG.getConstantFP(
13158 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13159 }
13160
13161 if (Mode != DenormalMode::getIEEE())
13162 return SDValue();
13163 }
13164
13165 if (C.isNaN()) {
13166 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13167 if (C.isSignaling()) {
13168 // Quiet a signaling NaN.
13169 // FIXME: Is this supposed to preserve payload bits?
13170 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13171 }
13172
13173 // Make sure it is the canonical NaN bitpattern.
13174 //
13175 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13176 // immediate?
13177 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13178 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13179 }
13180
13181 // Already canonical.
13182 return DAG.getConstantFP(C, SL, VT);
13183}
13184
13185static bool vectorEltWillFoldAway(SDValue Op) {
13186 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13187}
13188
13189SDValue
13190SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13191 DAGCombinerInfo &DCI) const {
13192 SelectionDAG &DAG = DCI.DAG;
13193 SDValue N0 = N->getOperand(0);
13194 EVT VT = N->getValueType(0);
13195
13196 // fcanonicalize undef -> qnan
13197 if (N0.isUndef()) {
13198 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13199 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13200 }
13201
13202 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13203 EVT VT = N->getValueType(0);
13204 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13205 }
13206
13207 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13208 // (fcanonicalize k)
13209 //
13210 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13211
13212 // TODO: This could be better with wider vectors that will be split to v2f16,
13213 // and to consider uses since there aren't that many packed operations.
13214 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13215 isTypeLegal(MVT::v2f16)) {
13216 SDLoc SL(N);
13217 SDValue NewElts[2];
13218 SDValue Lo = N0.getOperand(0);
13219 SDValue Hi = N0.getOperand(1);
13220 EVT EltVT = Lo.getValueType();
13221
13223 for (unsigned I = 0; I != 2; ++I) {
13224 SDValue Op = N0.getOperand(I);
13225 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13226 NewElts[I] =
13227 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13228 } else if (Op.isUndef()) {
13229 // Handled below based on what the other operand is.
13230 NewElts[I] = Op;
13231 } else {
13232 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13233 }
13234 }
13235
13236 // If one half is undef, and one is constant, prefer a splat vector rather
13237 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13238 // cheaper to use and may be free with a packed operation.
13239 if (NewElts[0].isUndef()) {
13240 if (isa<ConstantFPSDNode>(NewElts[1]))
13241 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13242 ? NewElts[1]
13243 : DAG.getConstantFP(0.0f, SL, EltVT);
13244 }
13245
13246 if (NewElts[1].isUndef()) {
13247 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13248 ? NewElts[0]
13249 : DAG.getConstantFP(0.0f, SL, EltVT);
13250 }
13251
13252 return DAG.getBuildVector(VT, SL, NewElts);
13253 }
13254 }
13255
13256 return SDValue();
13257}
13258
13259static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13260 switch (Opc) {
13261 case ISD::FMAXNUM:
13262 case ISD::FMAXNUM_IEEE:
13263 return AMDGPUISD::FMAX3;
13264 case ISD::FMAXIMUM:
13265 return AMDGPUISD::FMAXIMUM3;
13266 case ISD::SMAX:
13267 return AMDGPUISD::SMAX3;
13268 case ISD::UMAX:
13269 return AMDGPUISD::UMAX3;
13270 case ISD::FMINNUM:
13271 case ISD::FMINNUM_IEEE:
13272 return AMDGPUISD::FMIN3;
13273 case ISD::FMINIMUM:
13274 return AMDGPUISD::FMINIMUM3;
13275 case ISD::SMIN:
13276 return AMDGPUISD::SMIN3;
13277 case ISD::UMIN:
13278 return AMDGPUISD::UMIN3;
13279 default:
13280 llvm_unreachable("Not a min/max opcode");
13281 }
13282}
13283
13284SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13285 const SDLoc &SL, SDValue Src,
13286 SDValue MinVal,
13287 SDValue MaxVal,
13288 bool Signed) const {
13289
13290 // med3 comes from
13291 // min(max(x, K0), K1), K0 < K1
13292 // max(min(x, K0), K1), K1 < K0
13293 //
13294 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13295 // min/max op.
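// Illustrative example (hypothetical constants): smin(smax(x, -4), 17) with
// -4 < 17 becomes med3(x, -4, 17); the unsigned and 16-bit variants are
// handled the same way when the subtarget has a med3 for that type.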
13296 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13297 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13298
13299 if (!MinK || !MaxK)
13300 return SDValue();
13301
13302 if (Signed) {
13303 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13304 return SDValue();
13305 } else {
13306 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13307 return SDValue();
13308 }
13309
13310 EVT VT = MinK->getValueType(0);
13311 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13312 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13313 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13314
13315 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13316 // not available, but this is unlikely to be profitable as constants
13317 // will often need to be materialized & extended, especially on
13318 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13319 return SDValue();
13320}
13321
13322 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13323 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13324 return C;
13325
13326 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13327 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13328 return C;
13329 }
13330
13331 return nullptr;
13332}
13333
13334SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13335 const SDLoc &SL, SDValue Op0,
13336 SDValue Op1) const {
13337 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13338 if (!K1)
13339 return SDValue();
13340
13341 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13342 if (!K0)
13343 return SDValue();
13344
13345 // Ordered >= (although NaN inputs should have folded away by now).
13346 if (K0->getValueAPF() > K1->getValueAPF())
13347 return SDValue();
13348
13349 const MachineFunction &MF = DAG.getMachineFunction();
13350 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13351
13352 // TODO: Check IEEE bit enabled?
13353 EVT VT = Op0.getValueType();
13354 if (Info->getMode().DX10Clamp) {
13355 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13356 // hardware fmed3 behavior converting to a min.
13357 // FIXME: Should this be allowing -0.0?
13358 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13359 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13360 }
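// Illustrative example: fminnum(fmaxnum(x, 0.0), 1.0) reaches this point with
// Op0 = fmaxnum(x, 0.0), K0 = 0.0 and K1 = 1.0, and is turned into
// AMDGPUISD::CLAMP x when dx10_clamp is enabled.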
13361
13362 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13363 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13364 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13365 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13366 // then give the other result, which is different from med3 with a NaN
13367 // input.
13368 SDValue Var = Op0.getOperand(0);
13369 if (!DAG.isKnownNeverSNaN(Var))
13370 return SDValue();
13371
13372 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13373
13374 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13375 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13376 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13377 SDValue(K0, 0), SDValue(K1, 0));
13378 }
13379 }
13380
13381 return SDValue();
13382}
13383
13384/// \return true if the subtarget supports minimum3 and maximum3 with the given
13385/// base min/max opcode \p Opc for type \p VT.
13386static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13387 EVT VT) {
13388 switch (Opc) {
13389 case ISD::FMINNUM:
13390 case ISD::FMAXNUM:
13391 case ISD::FMINNUM_IEEE:
13392 case ISD::FMAXNUM_IEEE:
13393 case AMDGPUISD::FMIN_LEGACY:
13394 case AMDGPUISD::FMAX_LEGACY:
13395 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13396 case ISD::FMINIMUM:
13397 case ISD::FMAXIMUM:
13398 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13399 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13400 case ISD::SMAX:
13401 case ISD::SMIN:
13402 case ISD::UMAX:
13403 case ISD::UMIN:
13404 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13405 default:
13406 return false;
13407 }
13408
13409 llvm_unreachable("not a min/max opcode");
13410}
13411
13412SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13413 DAGCombinerInfo &DCI) const {
13414 SelectionDAG &DAG = DCI.DAG;
13415
13416 EVT VT = N->getValueType(0);
13417 unsigned Opc = N->getOpcode();
13418 SDValue Op0 = N->getOperand(0);
13419 SDValue Op1 = N->getOperand(1);
13420
13421 // Only do this if the inner op has one use, since otherwise this just
13422 // increases register pressure for no benefit.
13423
13424 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13425 // max(max(a, b), c) -> max3(a, b, c)
13426 // min(min(a, b), c) -> min3(a, b, c)
13427 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13428 SDLoc DL(N);
13429 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13430 Op0.getOperand(0), Op0.getOperand(1), Op1);
13431 }
13432
13433 // Try commuted.
13434 // max(a, max(b, c)) -> max3(a, b, c)
13435 // min(a, min(b, c)) -> min3(a, b, c)
13436 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13437 SDLoc DL(N);
13438 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13439 Op0, Op1.getOperand(0), Op1.getOperand(1));
13440 }
13441 }
13442
13443 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13444 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13445 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13446 if (SDValue Med3 = performIntMed3ImmCombine(
13447 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13448 return Med3;
13449 }
13450 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13451 if (SDValue Med3 = performIntMed3ImmCombine(
13452 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13453 return Med3;
13454 }
13455
13456 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13457 if (SDValue Med3 = performIntMed3ImmCombine(
13458 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13459 return Med3;
13460 }
13461 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13462 if (SDValue Med3 = performIntMed3ImmCombine(
13463 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13464 return Med3;
13465 }
13466
13467 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13468 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13469 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13470 (Opc == AMDGPUISD::FMIN_LEGACY &&
13471 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13472 (VT == MVT::f32 || VT == MVT::f64 ||
13473 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13474 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13475 Op0.hasOneUse()) {
13476 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13477 return Res;
13478 }
13479
13480 return SDValue();
13481}
13482
13483 static bool isClampZeroToOne(SDValue A, SDValue B) {
13484 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13485 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13486 // FIXME: Should this be allowing -0.0?
13487 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13488 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13489 }
13490 }
13491
13492 return false;
13493}
13494
13495// FIXME: Should only worry about snans for version with chain.
13496SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13497 DAGCombinerInfo &DCI) const {
13498 EVT VT = N->getValueType(0);
13499 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13500 // NaNs. With a NaN input, the order of the operands may change the result.
13501
13502 SelectionDAG &DAG = DCI.DAG;
13503 SDLoc SL(N);
13504
13505 SDValue Src0 = N->getOperand(0);
13506 SDValue Src1 = N->getOperand(1);
13507 SDValue Src2 = N->getOperand(2);
13508
13509 if (isClampZeroToOne(Src0, Src1)) {
13510 // const_a, const_b, x -> clamp is safe in all cases including signaling
13511 // nans.
13512 // FIXME: Should this be allowing -0.0?
13513 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13514 }
13515
13516 const MachineFunction &MF = DAG.getMachineFunction();
13517 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13518
13519 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13520 // handling no dx10-clamp?
13521 if (Info->getMode().DX10Clamp) {
13522 // If NaNs are clamped to 0, we are free to reorder the inputs.
13523
13524 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13525 std::swap(Src0, Src1);
13526
13527 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13528 std::swap(Src1, Src2);
13529
13530 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13531 std::swap(Src0, Src1);
13532
13533 if (isClampZeroToOne(Src1, Src2))
13534 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13535 }
13536
13537 return SDValue();
13538}
13539
13540SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13541 DAGCombinerInfo &DCI) const {
13542 SDValue Src0 = N->getOperand(0);
13543 SDValue Src1 = N->getOperand(1);
13544 if (Src0.isUndef() && Src1.isUndef())
13545 return DCI.DAG.getUNDEF(N->getValueType(0));
13546 return SDValue();
13547}
13548
13549// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13550// expanded into a set of cmp/select instructions.
13551 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13552 unsigned NumElem,
13553 bool IsDivergentIdx,
13554 const GCNSubtarget *Subtarget) {
13555 if (UseDivergentRegisterIndexing)
13556 return false;
13557
13558 unsigned VecSize = EltSize * NumElem;
13559
13560 // Sub-dword vectors of two dwords or less already have a better lowering.
13561 if (VecSize <= 64 && EltSize < 32)
13562 return false;
13563
13564 // Always expand the remaining sub-dword cases, otherwise they will be
13565 // lowered via memory.
13566 if (EltSize < 32)
13567 return true;
13568
13569 // Always do this if var-idx is divergent, otherwise it will become a loop.
13570 if (IsDivergentIdx)
13571 return true;
13572
13573 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13574 unsigned NumInsts = NumElem /* Number of compares */ +
13575 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
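// Worked example (illustrative): extracting from a v8i64 with a variable
// index gives NumElem = 8 and EltSize = 64, so NumInsts = 8 + 2 * 8 = 24.
// That exceeds both thresholds below, so the expansion is skipped whenever
// VGPR indexing or movrel is available. A v4i32 gives 4 + 4 = 8 and is
// expanded instead.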
13576
13577 // On some architectures (GFX9) movrel is not available and it's better
13578 // to expand.
13579 if (Subtarget->useVGPRIndexMode())
13580 return NumInsts <= 16;
13581
13582 // If movrel is available, use it instead of expanding for vector of 8
13583 // elements.
13584 if (Subtarget->hasMovrel())
13585 return NumInsts <= 15;
13586
13587 return true;
13588}
13589
13590 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13591 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13592 if (isa<ConstantSDNode>(Idx))
13593 return false;
13594
13595 SDValue Vec = N->getOperand(0);
13596 EVT VecVT = Vec.getValueType();
13597 EVT EltVT = VecVT.getVectorElementType();
13598 unsigned EltSize = EltVT.getSizeInBits();
13599 unsigned NumElem = VecVT.getVectorNumElements();
13600
13601 return SITargetLowering::shouldExpandVectorDynExt(
13602 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13603}
13604
13605SDValue
13606SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13607 DAGCombinerInfo &DCI) const {
13608 SDValue Vec = N->getOperand(0);
13609 SelectionDAG &DAG = DCI.DAG;
13610
13611 EVT VecVT = Vec.getValueType();
13612 EVT VecEltVT = VecVT.getVectorElementType();
13613 EVT ResVT = N->getValueType(0);
13614
13615 unsigned VecSize = VecVT.getSizeInBits();
13616 unsigned VecEltSize = VecEltVT.getSizeInBits();
13617
13618 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13619 allUsesHaveSourceMods(N)) {
13620 SDLoc SL(N);
13621 SDValue Idx = N->getOperand(1);
13622 SDValue Elt =
13623 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13624 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13625 }
13626
13627 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13628 // =>
13629 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13630 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13631 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
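// For instance (illustrative): (extract_vector_elt (fadd v2f32:a, v2f32:b), 1)
// becomes (fadd (extract_vector_elt a, 1), (extract_vector_elt b, 1)),
// reusing the original node's flags.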
13632 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13633 SDLoc SL(N);
13634 SDValue Idx = N->getOperand(1);
13635 unsigned Opc = Vec.getOpcode();
13636
13637 switch (Opc) {
13638 default:
13639 break;
13640 // TODO: Support other binary operations.
13641 case ISD::FADD:
13642 case ISD::FSUB:
13643 case ISD::FMUL:
13644 case ISD::ADD:
13645 case ISD::UMIN:
13646 case ISD::UMAX:
13647 case ISD::SMIN:
13648 case ISD::SMAX:
13649 case ISD::FMAXNUM:
13650 case ISD::FMINNUM:
13651 case ISD::FMAXNUM_IEEE:
13652 case ISD::FMINNUM_IEEE:
13653 case ISD::FMAXIMUM:
13654 case ISD::FMINIMUM: {
13655 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13656 Vec.getOperand(0), Idx);
13657 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13658 Vec.getOperand(1), Idx);
13659
13660 DCI.AddToWorklist(Elt0.getNode());
13661 DCI.AddToWorklist(Elt1.getNode());
13662 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13663 }
13664 }
13665 }
13666
13667 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13668 if (shouldExpandVectorDynExt(N)) {
13669 SDLoc SL(N);
13670 SDValue Idx = N->getOperand(1);
13671 SDValue V;
13672 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13673 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13674 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13675 if (I == 0)
13676 V = Elt;
13677 else
13678 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13679 }
13680 return V;
13681 }
13682
13683 if (!DCI.isBeforeLegalize())
13684 return SDValue();
13685
13686 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13687 // elements. This exposes more load reduction opportunities by replacing
13688 // multiple small extract_vector_elements with a single 32-bit extract.
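// Worked example (illustrative): extracting element 5 of a loaded v8i8 gives
// BitIndex = 40, so the vector is bitcast to v2i32, dword 1 is extracted,
// shifted right by the leftover 8 bits, and truncated back to i8.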
13689 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13690 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13691 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13692 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13693
13694 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13695 unsigned EltIdx = BitIndex / 32;
13696 unsigned LeftoverBitIdx = BitIndex % 32;
13697 SDLoc SL(N);
13698
13699 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13700 DCI.AddToWorklist(Cast.getNode());
13701
13702 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13703 DAG.getConstant(EltIdx, SL, MVT::i32));
13704 DCI.AddToWorklist(Elt.getNode());
13705 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13706 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13707 DCI.AddToWorklist(Srl.getNode());
13708
13709 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13710 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13711 DCI.AddToWorklist(Trunc.getNode());
13712
13713 if (VecEltVT == ResVT) {
13714 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13715 }
13716
13717 assert(ResVT.isScalarInteger());
13718 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13719 }
13720
13721 return SDValue();
13722}
13723
13724SDValue
13725SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13726 DAGCombinerInfo &DCI) const {
13727 SDValue Vec = N->getOperand(0);
13728 SDValue Idx = N->getOperand(2);
13729 EVT VecVT = Vec.getValueType();
13730 EVT EltVT = VecVT.getVectorElementType();
13731
13732 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13733 // => BUILD_VECTOR n x select (e, const-idx)
13734 if (!shouldExpandVectorDynExt(N))
13735 return SDValue();
13736
13737 SelectionDAG &DAG = DCI.DAG;
13738 SDLoc SL(N);
13739 SDValue Ins = N->getOperand(1);
13740 EVT IdxVT = Idx.getValueType();
13741
13742 SmallVector<SDValue, 16> Ops;
13743 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13744 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13745 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13746 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13747 Ops.push_back(V);
13748 }
13749
13750 return DAG.getBuildVector(VecVT, SL, Ops);
13751}
13752
13753/// Return the source of an fp_extend from f16 to f32, or a converted FP
13754/// constant.
13755 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13756 if (Src.getOpcode() == ISD::FP_EXTEND &&
13757 Src.getOperand(0).getValueType() == MVT::f16) {
13758 return Src.getOperand(0);
13759 }
13760
13761 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13762 APFloat Val = CFP->getValueAPF();
13763 bool LosesInfo = true;
13764 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13765 if (!LosesInfo)
13766 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13767 }
13768
13769 return SDValue();
13770}
13771
13772SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13773 DAGCombinerInfo &DCI) const {
13774 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13775 "combine only useful on gfx8");
13776
13777 SDValue TruncSrc = N->getOperand(0);
13778 EVT VT = N->getValueType(0);
13779 if (VT != MVT::f16)
13780 return SDValue();
13781
13782 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13783 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13784 return SDValue();
13785
13786 SelectionDAG &DAG = DCI.DAG;
13787 SDLoc SL(N);
13788
13789 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13790 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13791 // casting back.
13792
13793 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13794 // fmin(fmax(a, b), fmax(fmin(a, b), c))
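// Quick sanity check with a = 1.0, b = 3.0, c = 2.0 (illustrative):
// fmax(a, b) = 3, fmin(a, b) = 1, fmax(1, c) = 2, fmin(3, 2) = 2, which is
// indeed the median of the three inputs.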
13795 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13796 if (!A)
13797 return SDValue();
13798
13799 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13800 if (!B)
13801 return SDValue();
13802
13803 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13804 if (!C)
13805 return SDValue();
13806
13807 // This changes signaling nan behavior. If an input is a signaling nan, it
13808 // would have been quieted by the fpext originally. We don't care because
13809 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13810 // we would be worse off than just doing the promotion.
13811 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13812 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13813 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13814 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13815}
13816
13817unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13818 const SDNode *N0,
13819 const SDNode *N1) const {
13820 EVT VT = N0->getValueType(0);
13821
13822 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13823 // support denormals ever.
13824 if (((VT == MVT::f32 &&
13825 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13826 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13827 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13828 isOperationLegal(ISD::FMAD, VT))
13829 return ISD::FMAD;
13830
13831 const TargetOptions &Options = DAG.getTarget().Options;
13832 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13833 (N0->getFlags().hasAllowContract() &&
13834 N1->getFlags().hasAllowContract())) &&
13835 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13836 return ISD::FMA;
13837 }
13838
13839 return 0;
13840}
13841
13842// For a reassociatable opcode perform:
13843// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
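// Rationale (sketch): grouping the two uniform operands first lets that op be
// selected as a scalar (SALU) instruction; only the final op involving the
// divergent operand has to run on the VALU.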
13844SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13845 SelectionDAG &DAG) const {
13846 EVT VT = N->getValueType(0);
13847 if (VT != MVT::i32 && VT != MVT::i64)
13848 return SDValue();
13849
13850 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13851 return SDValue();
13852
13853 unsigned Opc = N->getOpcode();
13854 SDValue Op0 = N->getOperand(0);
13855 SDValue Op1 = N->getOperand(1);
13856
13857 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13858 return SDValue();
13859
13860 if (Op0->isDivergent())
13861 std::swap(Op0, Op1);
13862
13863 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13864 return SDValue();
13865
13866 SDValue Op2 = Op1.getOperand(1);
13867 Op1 = Op1.getOperand(0);
13868 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13869 return SDValue();
13870
13871 if (Op1->isDivergent())
13872 std::swap(Op1, Op2);
13873
13874 SDLoc SL(N);
13875 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13876 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13877}
13878
13879static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13880 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13881 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13882 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13883 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13884 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13885}
13886
13887// Fold
13888// y = lshr i64 x, 32
13889// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
13890// with Const.hi == -1
13891// To
13892 // res = mad_u64_u32 y.lo, Const.lo, x.lo
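// Why this is valid (sketch): with x = x.hi * 2^32 + x.lo and y = x >> 32 we
// have y = x.hi, and Const.hi == -1 means Const == Const.lo - 2^32 (mod 2^64).
// Hence y * Const + x == x.hi * Const.lo - x.hi * 2^32 + x
//                     == x.hi * Const.lo + x.lo (mod 2^64),
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)).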
13893 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
13894 SDValue MulLHS, SDValue MulRHS,
13895 SDValue AddRHS) {
13896 if (MulRHS.getOpcode() == ISD::SRL)
13897 std::swap(MulLHS, MulRHS);
13898
13899 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
13900 return SDValue();
13901
13902 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
13903 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
13904 MulLHS.getOperand(0) != AddRHS)
13905 return SDValue();
13906
13907 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
13908 if (!Const || Hi_32(Const->getZExtValue()) != -1)
13909 return SDValue();
13910
13911 SDValue ConstMul =
13912 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
13913 return getMad64_32(DAG, SL, MVT::i64,
13914 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
13915 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
13916}
13917
13918// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13919// multiplies, if any.
13920//
13921// Full 64-bit multiplies that feed into an addition are lowered here instead
13922// of using the generic expansion. The generic expansion ends up with
13923// a tree of ADD nodes that prevents us from using the "add" part of the
13924// MAD instruction. The expansion produced here results in a chain of ADDs
13925// instead of a tree.
13926SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13927 DAGCombinerInfo &DCI) const {
13928 assert(N->getOpcode() == ISD::ADD);
13929
13930 SelectionDAG &DAG = DCI.DAG;
13931 EVT VT = N->getValueType(0);
13932 SDLoc SL(N);
13933 SDValue LHS = N->getOperand(0);
13934 SDValue RHS = N->getOperand(1);
13935
13936 if (VT.isVector())
13937 return SDValue();
13938
13939 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13940 // result in scalar registers for uniform values.
13941 if (!N->isDivergent() && Subtarget->hasSMulHi())
13942 return SDValue();
13943
13944 unsigned NumBits = VT.getScalarSizeInBits();
13945 if (NumBits <= 32 || NumBits > 64)
13946 return SDValue();
13947
13948 if (LHS.getOpcode() != ISD::MUL) {
13949 assert(RHS.getOpcode() == ISD::MUL);
13950 std::swap(LHS, RHS);
13951 }
13952
13953 // Avoid the fold if it would unduly increase the number of multiplies due to
13954 // multiple uses, except on hardware with full-rate multiply-add (which is
13955 // part of full-rate 64-bit ops).
13956 if (!Subtarget->hasFullRate64Ops()) {
13957 unsigned NumUsers = 0;
13958 for (SDNode *User : LHS->users()) {
13959 // There is a use that does not feed into addition, so the multiply can't
13960 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13961 if (User->getOpcode() != ISD::ADD)
13962 return SDValue();
13963
13964 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13965 // MUL + 3xADD + 3xADDC over 3xMAD.
13966 ++NumUsers;
13967 if (NumUsers >= 3)
13968 return SDValue();
13969 }
13970 }
13971
13972 SDValue MulLHS = LHS.getOperand(0);
13973 SDValue MulRHS = LHS.getOperand(1);
13974 SDValue AddRHS = RHS;
13975
13976 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
13977 return FoldedMAD;
13978
13979 // Always check whether operands are small unsigned values, since that
13980 // knowledge is useful in more cases. Check for small signed values only if
13981 // doing so can unlock a shorter code sequence.
13982 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13983 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13984
13985 bool MulSignedLo = false;
13986 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13987 MulSignedLo =
13988 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
13989 }
13990
13991 // The operands and final result all have the same number of bits. If
13992 // operands need to be extended, they can be extended with garbage. The
13993 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13994 // truncated away in the end.
13995 if (VT != MVT::i64) {
13996 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13997 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13998 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13999 }
14000
14001 // The basic code generated is conceptually straightforward. Pseudo code:
14002 //
14003 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14004 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14005 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14006 //
14007 // The second and third lines are optional, depending on whether the factors
14008 // are {sign,zero}-extended or not.
14009 //
14010 // The actual DAG is noisier than the pseudo code, but only due to
14011 // instructions that disassemble values into low and high parts, and
14012 // assemble the final result.
14013 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14014
14015 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14016 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14017 SDValue Accum =
14018 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14019
14020 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14021 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14022
14023 if (!MulLHSUnsigned32) {
14024 auto MulLHSHi =
14025 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14026 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14027 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14028 }
14029
14030 if (!MulRHSUnsigned32) {
14031 auto MulRHSHi =
14032 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14033 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14034 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14035 }
14036
14037 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14038 Accum = DAG.getBitcast(MVT::i64, Accum);
14039 }
14040
14041 if (VT != MVT::i64)
14042 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14043 return Accum;
14044}
14045
14046SDValue
14047SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14048 DAGCombinerInfo &DCI) const {
14049 SDValue RHS = N->getOperand(1);
14050 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14051 if (!CRHS)
14052 return SDValue();
14053
14054 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14055 // common.
14056 uint64_t Val = CRHS->getZExtValue();
14057 if (countr_zero(Val) >= 32) {
14058 SelectionDAG &DAG = DCI.DAG;
14059 SDLoc SL(N);
14060 SDValue LHS = N->getOperand(0);
14061
14062 // Avoid carry machinery if we know the low half of the add does not
14063 // contribute to the final result.
14064 //
14065 // add i64:x, K if computeTrailingZeros(K) >= 32
14066 // => build_pair (add x.hi, K.hi), x.lo
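// For example (illustrative): add i64 %x, 0x500000000 only affects the high
// half, so it becomes build_pair (add x.hi, 5), x.lo.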
14067
14068 // Breaking the 64-bit add here with this strange constant is unlikely
14069 // to interfere with addressing mode patterns.
14070
14071 SDValue Hi = getHiHalf64(LHS, DAG);
14072 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14073 SDValue AddHi =
14074 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14075
14076 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14077 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14078 }
14079
14080 return SDValue();
14081}
14082
14083 // Collect the ultimate src of each of the mul node's operands, and confirm
14084 // each operand is no wider than 8 bits (a single byte).
14085static std::optional<ByteProvider<SDValue>>
14086handleMulOperand(const SDValue &MulOperand) {
14087 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14088 if (!Byte0 || Byte0->isConstantZero()) {
14089 return std::nullopt;
14090 }
14091 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14092 if (Byte1 && !Byte1->isConstantZero()) {
14093 return std::nullopt;
14094 }
14095 return Byte0;
14096}
14097
14098static unsigned addPermMasks(unsigned First, unsigned Second) {
14099 unsigned FirstCs = First & 0x0c0c0c0c;
14100 unsigned SecondCs = Second & 0x0c0c0c0c;
14101 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14102 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14103
14104 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14105 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14106 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14107 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14108
14109 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14110}
14111
14112struct DotSrc {
14113 SDValue SrcOp;
14114 int64_t PermMask;
14115 int64_t DWordOffset;
14116 };
14117
14118 static void placeSources(ByteProvider<SDValue> &Src0,
14119 ByteProvider<SDValue> &Src1,
14120 SmallVectorImpl<DotSrc> &Src0s,
14121 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14122
14123 assert(Src0.Src.has_value() && Src1.Src.has_value());
14124 // Src0s and Src1s are empty, just place arbitrarily.
14125 if (Step == 0) {
14126 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14127 Src0.SrcOffset / 4});
14128 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14129 Src1.SrcOffset / 4});
14130 return;
14131 }
14132
14133 for (int BPI = 0; BPI < 2; BPI++) {
14134 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14135 if (BPI == 1) {
14136 BPP = {Src1, Src0};
14137 }
14138 unsigned ZeroMask = 0x0c0c0c0c;
14139 unsigned FMask = 0xFF << (8 * (3 - Step));
14140
14141 unsigned FirstMask =
14142 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14143 unsigned SecondMask =
14144 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14145 // Attempt to find a Src vector which contains our SDValue; if found, add our
14146 // perm mask to the existing one. If we are unable to find a match for the
14147 // first SDValue, attempt to find a match for the second.
14148 int FirstGroup = -1;
14149 for (int I = 0; I < 2; I++) {
14150 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14151 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14152 return IterElt.SrcOp == *BPP.first.Src &&
14153 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14154 };
14155
14156 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14157 if (Match != Srcs.end()) {
14158 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14159 FirstGroup = I;
14160 break;
14161 }
14162 }
14163 if (FirstGroup != -1) {
14164 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14165 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14166 return IterElt.SrcOp == *BPP.second.Src &&
14167 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14168 };
14169 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14170 if (Match != Srcs.end()) {
14171 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14172 } else
14173 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14174 return;
14175 }
14176 }
14177
14178 // If we have made it here, then we could not find a match in Src0s or Src1s
14179 // for either Src0 or Src1, so just place them arbitrarily.
14180
14181 unsigned ZeroMask = 0x0c0c0c0c;
14182 unsigned FMask = 0xFF << (8 * (3 - Step));
14183
14184 Src0s.push_back(
14185 {*Src0.Src,
14186 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14187 Src0.SrcOffset / 4});
14188 Src1s.push_back(
14189 {*Src1.Src,
14190 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14191 Src1.SrcOffset / 4});
14192}
14193
14194 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14195 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14196 bool IsAny) {
14197
14198 // If we just have one source, just permute it accordingly.
14199 if (Srcs.size() == 1) {
14200 auto *Elt = Srcs.begin();
14201 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14202
14203 // v_perm will produce the original value
14204 if (Elt->PermMask == 0x3020100)
14205 return EltOp;
14206
14207 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14208 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14209 }
14210
14211 auto *FirstElt = Srcs.begin();
14212 auto *SecondElt = std::next(FirstElt);
14213
14214 SmallVector<SDValue, 2> Perms;
14215
14216 // If we have multiple sources in the chain, combine them via perms (using
14217 // calculated perm mask) and Ors.
14218 while (true) {
14219 auto FirstMask = FirstElt->PermMask;
14220 auto SecondMask = SecondElt->PermMask;
14221
14222 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14223 unsigned FirstPlusFour = FirstMask | 0x04040404;
14224 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
14225 // original 0x0C.
14226 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14227
14228 auto PermMask = addPermMasks(FirstMask, SecondMask);
14229 auto FirstVal =
14230 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14231 auto SecondVal =
14232 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14233
14234 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14235 SecondVal,
14236 DAG.getConstant(PermMask, SL, MVT::i32)));
14237
14238 FirstElt = std::next(SecondElt);
14239 if (FirstElt == Srcs.end())
14240 break;
14241
14242 SecondElt = std::next(FirstElt);
14243 // If we only have a FirstElt, then just combine that into the cumulative
14244 // source node.
14245 if (SecondElt == Srcs.end()) {
14246 auto EltOp =
14247 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14248
14249 Perms.push_back(
14250 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14251 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14252 break;
14253 }
14254 }
14255
14256 assert(Perms.size() == 1 || Perms.size() == 2);
14257 return Perms.size() == 2
14258 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14259 : Perms[0];
14260}
14261
14262static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14263 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14264 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14265 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14266 EntryMask += ZeroMask;
14267 }
14268}
14269
14270static bool isMul(const SDValue Op) {
14271 auto Opcode = Op.getOpcode();
14272
14273 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14274 Opcode == AMDGPUISD::MUL_I24);
14275}
14276
14277static std::optional<bool>
14278 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14279 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14280 const SDValue &S1Op, const SelectionDAG &DAG) {
14281 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14282 // of the dot4 are irrelevant.
14283 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14284 return false;
14285
14286 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14287 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14288 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14289 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14290 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14291 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14292
14293 assert(!(S0IsUnsigned && S0IsSigned));
14294 assert(!(S1IsUnsigned && S1IsSigned));
14295
14296 // There are 9 possible permutations of
14297 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14298
14299 // In two permutations, the sign bits are known to be the same for both Ops,
14300 // so simply return Signed / Unsigned corresponding to the MSB
14301
14302 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14303 return S0IsSigned;
14304
14305 // In another two permutations, the sign bits are known to be opposite. In
14306 // this case return std::nullopt to indicate a bad match.
14307
14308 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14309 return std::nullopt;
14310
14311 // In the remaining five permutations, we don't know the value of the sign
14312 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14313 // the upper bits must be extension bits. Thus, the only ways for the sign
14314 // bit to be unknown are that it was sign extended from an unknown value or
14315 // that it was any extended. In either case, it is correct to use the signed
14316 // version of the signedness semantics of dot4.
14317
14318 // In two such permutations, we know the sign bit is set for
14319 // one op, and the other is unknown. It is okay to use the signed version of
14320 // dot4.
14321 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14322 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14323 return true;
14324
14325 // In one such permutation, we don't know either of the sign bits. It is okay
14326 // to use the signed version of dot4.
14327 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14328 return true;
14329
14330 // In two such permutations, we know the sign bit is unset for
14331 // one op, and the other is unknown. Return std::nullopt to indicate a
14332 // bad match.
14333 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14334 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14335 return std::nullopt;
14336
14337 llvm_unreachable("Fully covered condition");
14338}
14339
14340SDValue SITargetLowering::performAddCombine(SDNode *N,
14341 DAGCombinerInfo &DCI) const {
14342 SelectionDAG &DAG = DCI.DAG;
14343 EVT VT = N->getValueType(0);
14344 SDLoc SL(N);
14345 SDValue LHS = N->getOperand(0);
14346 SDValue RHS = N->getOperand(1);
14347
14348 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14349 if (Subtarget->hasMad64_32()) {
14350 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14351 return Folded;
14352 }
14353 }
14354
14355 if (SDValue V = reassociateScalarOps(N, DAG)) {
14356 return V;
14357 }
14358
14359 if (VT == MVT::i64) {
14360 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14361 return Folded;
14362 }
14363
14364 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14365 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14366 SDValue TempNode(N, 0);
14367 std::optional<bool> IsSigned;
14368 SmallVector<DotSrc, 4> Src0s;
14369 SmallVector<DotSrc, 4> Src1s;
14370 SmallVector<SDValue, 4> Src2s;
14371
14372 // Match the v_dot4 tree, while collecting src nodes.
14373 int ChainLength = 0;
14374 for (int I = 0; I < 4; I++) {
14375 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14376 if (MulIdx == -1)
14377 break;
14378 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14379 if (!Src0)
14380 break;
14381 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14382 if (!Src1)
14383 break;
14384
14385 auto IterIsSigned = checkDot4MulSignedness(
14386 TempNode->getOperand(MulIdx), *Src0, *Src1,
14387 TempNode->getOperand(MulIdx)->getOperand(0),
14388 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14389 if (!IterIsSigned)
14390 break;
14391 if (!IsSigned)
14392 IsSigned = *IterIsSigned;
14393 if (*IterIsSigned != *IsSigned)
14394 break;
14395 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14396 auto AddIdx = 1 - MulIdx;
14397 // Allow the special case where add (add (mul24, 0), mul24) became ->
14398 // add (mul24, mul24).
14399 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14400 Src2s.push_back(TempNode->getOperand(AddIdx));
14401 auto Src0 =
14402 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14403 if (!Src0)
14404 break;
14405 auto Src1 =
14406 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14407 if (!Src1)
14408 break;
14409 auto IterIsSigned = checkDot4MulSignedness(
14410 TempNode->getOperand(AddIdx), *Src0, *Src1,
14411 TempNode->getOperand(AddIdx)->getOperand(0),
14412 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14413 if (!IterIsSigned)
14414 break;
14415 assert(IsSigned);
14416 if (*IterIsSigned != *IsSigned)
14417 break;
14418 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14419 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14420 ChainLength = I + 2;
14421 break;
14422 }
14423
14424 TempNode = TempNode->getOperand(AddIdx);
14425 Src2s.push_back(TempNode);
14426 ChainLength = I + 1;
14427 if (TempNode->getNumOperands() < 2)
14428 break;
14429 LHS = TempNode->getOperand(0);
14430 RHS = TempNode->getOperand(1);
14431 }
14432
14433 if (ChainLength < 2)
14434 return SDValue();
14435
14436 // Masks were constructed with the assumption that we would find a chain of
14437 // length 4. If not, then we need to zero out the most significant bytes (via
14438 // a perm-mask byte of 0x0c) so they do not affect the dot calculation.
14439 if (ChainLength < 4) {
14440 fixMasks(Src0s, ChainLength);
14441 fixMasks(Src1s, ChainLength);
14442 }
14443
14444 SDValue Src0, Src1;
14445
14446 // If we are just using a single source for both, and have permuted the
14447 // bytes consistently, we can just use the sources without permuting
14448 // (commutation).
14449 bool UseOriginalSrc = false;
14450 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14451 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14452 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14453 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14454 SmallVector<unsigned, 4> SrcBytes;
14455 auto Src0Mask = Src0s.begin()->PermMask;
14456 SrcBytes.push_back(Src0Mask & 0xFF000000);
14457 bool UniqueEntries = true;
14458 for (auto I = 1; I < 4; I++) {
14459 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14460
14461 if (is_contained(SrcBytes, NextByte)) {
14462 UniqueEntries = false;
14463 break;
14464 }
14465 SrcBytes.push_back(NextByte);
14466 }
14467
14468 if (UniqueEntries) {
14469 UseOriginalSrc = true;
14470
14471 auto *FirstElt = Src0s.begin();
14472 auto FirstEltOp =
14473 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14474
14475 auto *SecondElt = Src1s.begin();
14476 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14477 SecondElt->DWordOffset);
14478
14479 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14480 MVT::getIntegerVT(32));
14481 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14482 MVT::getIntegerVT(32));
14483 }
14484 }
14485
14486 if (!UseOriginalSrc) {
14487 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14488 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14489 }
14490
14491 assert(IsSigned);
14492 SDValue Src2 =
14493 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14494
14495 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14496 : Intrinsic::amdgcn_udot4,
14497 SL, MVT::i64);
14498
14499 assert(!VT.isVector());
14500 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14501 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14502
14503 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14504 }
14505
14506 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14507 return SDValue();
14508
14509 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14510 // add x, sext (setcc) => usubo_carry x, 0, setcc
14511 unsigned Opc = LHS.getOpcode();
14512 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14513 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14514 std::swap(RHS, LHS);
14515
14516 Opc = RHS.getOpcode();
14517 switch (Opc) {
14518 default:
14519 break;
14520 case ISD::ZERO_EXTEND:
14521 case ISD::SIGN_EXTEND:
14522 case ISD::ANY_EXTEND: {
14523 auto Cond = RHS.getOperand(0);
14524 // If this won't be a real VOPC output, we would still need to insert an
14525 // extra instruction anyway.
14526 if (!isBoolSGPR(Cond))
14527 break;
14528 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14529 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14530 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14531 return DAG.getNode(Opc, SL, VTList, Args);
14532 }
14533 case ISD::UADDO_CARRY: {
14534 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14535 if (!isNullConstant(RHS.getOperand(1)))
14536 break;
14537 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14538 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14539 }
14540 }
14541 return SDValue();
14542}
14543
14544SDValue SITargetLowering::performSubCombine(SDNode *N,
14545 DAGCombinerInfo &DCI) const {
14546 SelectionDAG &DAG = DCI.DAG;
14547 EVT VT = N->getValueType(0);
14548
14549 if (VT == MVT::i64) {
14550 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14551 return Folded;
14552 }
14553
14554 if (VT != MVT::i32)
14555 return SDValue();
14556
14557 SDLoc SL(N);
14558 SDValue LHS = N->getOperand(0);
14559 SDValue RHS = N->getOperand(1);
14560
14561 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14562 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14563 unsigned Opc = RHS.getOpcode();
14564 switch (Opc) {
14565 default:
14566 break;
14567 case ISD::ZERO_EXTEND:
14568 case ISD::SIGN_EXTEND:
14569 case ISD::ANY_EXTEND: {
14570 auto Cond = RHS.getOperand(0);
14571 // If this won't be a real VOPC output, we would still need to insert an
14572 // extra instruction anyway.
14573 if (!isBoolSGPR(Cond))
14574 break;
14575 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14576 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14577 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14578 return DAG.getNode(Opc, SL, VTList, Args);
14579 }
14580 }
14581
14582 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14583 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14584 if (!isNullConstant(LHS.getOperand(1)))
14585 return SDValue();
14586 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14587 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14588 }
14589 return SDValue();
14590}
14591
14592SDValue
14593SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14594 DAGCombinerInfo &DCI) const {
14595
14596 if (N->getValueType(0) != MVT::i32)
14597 return SDValue();
14598
14599 if (!isNullConstant(N->getOperand(1)))
14600 return SDValue();
14601
14602 SelectionDAG &DAG = DCI.DAG;
14603 SDValue LHS = N->getOperand(0);
14604
14605 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14606 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14607 unsigned LHSOpc = LHS.getOpcode();
14608 unsigned Opc = N->getOpcode();
14609 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14610 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14611 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14612 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14613 }
14614 return SDValue();
14615}
14616
14617SDValue SITargetLowering::performFAddCombine(SDNode *N,
14618 DAGCombinerInfo &DCI) const {
14619 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14620 return SDValue();
14621
14622 SelectionDAG &DAG = DCI.DAG;
14623 EVT VT = N->getValueType(0);
14624
14625 SDLoc SL(N);
14626 SDValue LHS = N->getOperand(0);
14627 SDValue RHS = N->getOperand(1);
14628
14629 // These should really be instruction patterns, but writing patterns with
14630 // source modifiers is a pain.
14631
14632 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14633 if (LHS.getOpcode() == ISD::FADD) {
14634 SDValue A = LHS.getOperand(0);
14635 if (A == LHS.getOperand(1)) {
14636 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14637 if (FusedOp != 0) {
14638 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14639 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14640 }
14641 }
14642 }
14643
14644 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14645 if (RHS.getOpcode() == ISD::FADD) {
14646 SDValue A = RHS.getOperand(0);
14647 if (A == RHS.getOperand(1)) {
14648 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14649 if (FusedOp != 0) {
14650 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14651 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14652 }
14653 }
14654 }
14655
14656 return SDValue();
14657}
14658
14659SDValue SITargetLowering::performFSubCombine(SDNode *N,
14660 DAGCombinerInfo &DCI) const {
14661 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14662 return SDValue();
14663
14664 SelectionDAG &DAG = DCI.DAG;
14665 SDLoc SL(N);
14666 EVT VT = N->getValueType(0);
14667 assert(!VT.isVector());
14668
14669 // Try to get the fneg to fold into the source modifier. This undoes generic
14670 // DAG combines and folds them into the mad.
14671 //
14672 // Only do this if we are not trying to support denormals. v_mad_f32 does
14673 // not support denormals ever.
14674 SDValue LHS = N->getOperand(0);
14675 SDValue RHS = N->getOperand(1);
14676 if (LHS.getOpcode() == ISD::FADD) {
14677 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14678 SDValue A = LHS.getOperand(0);
14679 if (A == LHS.getOperand(1)) {
14680 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14681 if (FusedOp != 0) {
14682 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14683 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14684
14685 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14686 }
14687 }
14688 }
14689
14690 if (RHS.getOpcode() == ISD::FADD) {
14691 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14692
14693 SDValue A = RHS.getOperand(0);
14694 if (A == RHS.getOperand(1)) {
14695 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14696 if (FusedOp != 0) {
14697 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14698 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14699 }
14700 }
14701 }
14702
14703 return SDValue();
14704}
14705
14706SDValue SITargetLowering::performFDivCombine(SDNode *N,
14707 DAGCombinerInfo &DCI) const {
14708 SelectionDAG &DAG = DCI.DAG;
14709 SDLoc SL(N);
14710 EVT VT = N->getValueType(0);
14711 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14712 return SDValue();
14713
14714 SDValue LHS = N->getOperand(0);
14715 SDValue RHS = N->getOperand(1);
14716
14717 SDNodeFlags Flags = N->getFlags();
14718 SDNodeFlags RHSFlags = RHS->getFlags();
14719 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14720 !RHS->hasOneUse())
14721 return SDValue();
14722
14723 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14724 bool IsNegative = false;
14725 if (CLHS->isExactlyValue(1.0) ||
14726 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14727 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14728 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14729 if (RHS.getOpcode() == ISD::FSQRT) {
14730 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14731 SDValue Rsq =
14732 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14733 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14734 }
14735 }
14736 }
14737
14738 return SDValue();
14739}
14740
14741SDValue SITargetLowering::performFMulCombine(SDNode *N,
14742 DAGCombinerInfo &DCI) const {
14743 SelectionDAG &DAG = DCI.DAG;
14744 EVT VT = N->getValueType(0);
14745 EVT ScalarVT = VT.getScalarType();
14746 EVT IntVT = VT.changeElementType(MVT::i32);
14747
14748 SDValue LHS = N->getOperand(0);
14749 SDValue RHS = N->getOperand(1);
14750
14751 // It is cheaper to materialize i32 inline constants than to materialize
14752 // f16 or f64 (or even non-inline f32) values; this is possible via ldexp,
14753 // as shown below:
14754 //
14755 // Given : A = 2^a & B = 2^b ; where a and b are integers.
14756 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
14757 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
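// Illustrative example: with A = 8.0 = 2^3 and B = 0.5 = 2^-1,
//   fmul x, (select y, 8.0, 0.5) -> ldexp(x, (select i32 y, 3, -1))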
14758 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14759 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14760 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14761 if (!TrueNode)
14762 return SDValue();
14763 const ConstantFPSDNode *FalseNode =
14764 isConstOrConstSplatFP(RHS.getOperand(2));
14765 if (!FalseNode)
14766 return SDValue();
14767
14768 if (TrueNode->isNegative() != FalseNode->isNegative())
14769 return SDValue();
14770
14771 // For f32, only non-inline constants should be transformed.
14772 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14773 if (ScalarVT == MVT::f32 &&
14774 TII->isInlineConstant(TrueNode->getValueAPF()) &&
14775 TII->isInlineConstant(FalseNode->getValueAPF()))
14776 return SDValue();
14777
14778 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14779 if (TrueNodeExpVal == INT_MIN)
14780 return SDValue();
14781 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14782 if (FalseNodeExpVal == INT_MIN)
14783 return SDValue();
14784
14785 SDLoc SL(N);
14786 SDValue SelectNode =
14787 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14788 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14789 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14790
14791 LHS = TrueNode->isNegative()
14792 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14793 : LHS;
14794
14795 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14796 }
14797
14798 return SDValue();
14799}
14800
14801SDValue SITargetLowering::performFMACombine(SDNode *N,
14802 DAGCombinerInfo &DCI) const {
14803 SelectionDAG &DAG = DCI.DAG;
14804 EVT VT = N->getValueType(0);
14805 SDLoc SL(N);
14806
14807 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14808 return SDValue();
14809
14810 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14811 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14812 SDValue Op1 = N->getOperand(0);
14813 SDValue Op2 = N->getOperand(1);
14814 SDValue FMA = N->getOperand(2);
14815
14816 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14817 Op2.getOpcode() != ISD::FP_EXTEND)
14818 return SDValue();
14819
14820 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14821 // regardless of the denorm mode setting. Therefore,
14822 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14823 const TargetOptions &Options = DAG.getTarget().Options;
14824 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14825 (N->getFlags().hasAllowContract() &&
14826 FMA->getFlags().hasAllowContract())) {
14827 Op1 = Op1.getOperand(0);
14828 Op2 = Op2.getOperand(0);
14829 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14830 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14831 return SDValue();
14832
14833 SDValue Vec1 = Op1.getOperand(0);
14834 SDValue Idx1 = Op1.getOperand(1);
14835 SDValue Vec2 = Op2.getOperand(0);
14836
14837 SDValue FMAOp1 = FMA.getOperand(0);
14838 SDValue FMAOp2 = FMA.getOperand(1);
14839 SDValue FMAAcc = FMA.getOperand(2);
14840
14841 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14842 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14843 return SDValue();
14844
14845 FMAOp1 = FMAOp1.getOperand(0);
14846 FMAOp2 = FMAOp2.getOperand(0);
14847 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14848 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14849 return SDValue();
14850
14851 SDValue Vec3 = FMAOp1.getOperand(0);
14852 SDValue Vec4 = FMAOp2.getOperand(0);
14853 SDValue Idx2 = FMAOp1.getOperand(1);
14854
14855 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14856 // Idx1 and Idx2 cannot be the same.
14857 Idx1 == Idx2)
14858 return SDValue();
14859
14860 if (Vec1 == Vec2 || Vec3 == Vec4)
14861 return SDValue();
14862
14863 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14864 return SDValue();
14865
14866 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14867 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14868 DAG.getTargetConstant(0, SL, MVT::i1));
14869 }
14870 }
14871 return SDValue();
14872}
14873
14874SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14875 DAGCombinerInfo &DCI) const {
14876 SelectionDAG &DAG = DCI.DAG;
14877 SDLoc SL(N);
14878
14879 SDValue LHS = N->getOperand(0);
14880 SDValue RHS = N->getOperand(1);
14881 EVT VT = LHS.getValueType();
14882 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14883
14884 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14885 if (!CRHS) {
14886 CRHS = dyn_cast<ConstantSDNode>(LHS);
14887 if (CRHS) {
14888 std::swap(LHS, RHS);
14889 CC = ISD::getSetCCSwappedOperands(CC);
14890 }
14891 }
14892
14893 if (CRHS) {
14894 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14895 isBoolSGPR(LHS.getOperand(0))) {
14896 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14897 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14898 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14899 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
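// These folds follow from how sext from i1 works: a true lane becomes -1 and
// a false lane becomes 0, so comparing the sign-extended value against -1 or
// 0 reduces to the boolean itself or its negation.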
14900 if ((CRHS->isAllOnes() &&
14901 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14902 (CRHS->isZero() &&
14903 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14904 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14905 DAG.getAllOnesConstant(SL, MVT::i1));
14906 if ((CRHS->isAllOnes() &&
14907 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14908 (CRHS->isZero() &&
14909 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14910 return LHS.getOperand(0);
14911 }
14912
14913 const APInt &CRHSVal = CRHS->getAPIntValue();
14914 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14915 LHS.getOpcode() == ISD::SELECT &&
14916 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14917 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14918 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14919 isBoolSGPR(LHS.getOperand(0))) {
14920 // Given CT != FT:
14921 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14922 // setcc (select cc, CT, CF), CF, ne => cc
14923 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14924 // setcc (select cc, CT, CF), CT, eq => cc
14925 const APInt &CT = LHS.getConstantOperandAPInt(1);
14926 const APInt &CF = LHS.getConstantOperandAPInt(2);
14927
14928 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14929 (CT == CRHSVal && CC == ISD::SETNE))
14930 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14931 DAG.getAllOnesConstant(SL, MVT::i1));
14932 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14933 (CT == CRHSVal && CC == ISD::SETEQ))
14934 return LHS.getOperand(0);
14935 }
14936 }
14937
14938 if (VT != MVT::f32 && VT != MVT::f64 &&
14939 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14940 return SDValue();
14941
14942 // Match isinf/isfinite pattern
14943 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14944 // (fcmp one (fabs x), inf) -> (fp_class x,
14945 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14946 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
14947 LHS.getOpcode() == ISD::FABS) {
14948 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14949 if (!CRHS)
14950 return SDValue();
14951
14952 const APFloat &APF = CRHS->getValueAPF();
14953 if (APF.isInfinity() && !APF.isNegative()) {
14954 const unsigned IsInfMask =
14955 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
14956 const unsigned IsFiniteMask =
14957 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
14958 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
14959 SIInstrFlags::P_SUBNORMAL;
14960 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14961 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14962 DAG.getConstant(Mask, SL, MVT::i32));
14963 }
14964 }
14965
14966 return SDValue();
14967}
14968
14969SDValue
14970SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14971 DAGCombinerInfo &DCI) const {
14972 SelectionDAG &DAG = DCI.DAG;
14973 SDLoc SL(N);
14974 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14975
14976 SDValue Src = N->getOperand(0);
14977 SDValue Shift = N->getOperand(0);
14978
14979 // TODO: Extend type shouldn't matter (assuming legal types).
14980 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14981 Shift = Shift.getOperand(0);
14982
14983 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14984 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14985 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14986 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14987 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14988 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
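// Worked example (illustrative): for cvt_f32_ubyte0 (srl x, 16) we have
// Offset = 0 and ShiftOffset = 8 * 0 + 16 = 16, which is a multiple of 8 and
// below 32, so the node becomes cvt_f32_ubyte2 x, matching the list above.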
14989 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14990 SDValue Shifted = DAG.getZExtOrTrunc(
14991 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
14992
14993 unsigned ShiftOffset = 8 * Offset;
14994 if (Shift.getOpcode() == ISD::SHL)
14995 ShiftOffset -= C->getZExtValue();
14996 else
14997 ShiftOffset += C->getZExtValue();
14998
14999 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15000 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15001 MVT::f32, Shifted);
15002 }
15003 }
15004 }
15005
15006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15007 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15008 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15009 // We simplified Src. If this node is not dead, visit it again so it is
15010 // folded properly.
15011 if (N->getOpcode() != ISD::DELETED_NODE)
15012 DCI.AddToWorklist(N);
15013 return SDValue(N, 0);
15014 }
15015
15016 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15017 if (SDValue DemandedSrc =
15018 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15019 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15020
15021 return SDValue();
15022}
15023
15024SDValue SITargetLowering::performClampCombine(SDNode *N,
15025 DAGCombinerInfo &DCI) const {
15026 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15027 if (!CSrc)
15028 return SDValue();
15029
15030 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15031 const APFloat &F = CSrc->getValueAPF();
15032 APFloat Zero = APFloat::getZero(F.getSemantics());
15033 if (F < Zero ||
15034 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15035 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15036 }
15037
15038 APFloat One(F.getSemantics(), "1.0");
15039 if (F > One)
15040 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15041
15042 return SDValue(CSrc, 0);
15043}
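// In effect, clamp of a floating-point constant folds to 0.0 for negative
// inputs (and for NaN when DX10 clamp mode is enabled), to 1.0 for inputs
// greater than one, and is otherwise left as the original constant; e.g.
// clamp(2.5) -> 1.0 and clamp(-0.5) -> 0.0 (illustrative values).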
15044
15045 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15046 DAGCombinerInfo &DCI) const {
15047 switch (N->getOpcode()) {
15048 case ISD::ADD:
15049 case ISD::SUB:
15050 case ISD::SHL:
15051 case ISD::SRL:
15052 case ISD::SRA:
15053 case ISD::AND:
15054 case ISD::OR:
15055 case ISD::XOR:
15056 case ISD::MUL:
15057 case ISD::SETCC:
15058 case ISD::SELECT:
15059 case ISD::SMIN:
15060 case ISD::SMAX:
15061 case ISD::UMIN:
15062 case ISD::UMAX:
15063 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15064 return Res;
15065 break;
15066 default:
15067 break;
15068 }
15069
15070 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15071 return SDValue();
15072
15073 switch (N->getOpcode()) {
15074 case ISD::ADD:
15075 return performAddCombine(N, DCI);
15076 case ISD::SUB:
15077 return performSubCombine(N, DCI);
15078 case ISD::UADDO_CARRY:
15079 case ISD::USUBO_CARRY:
15080 return performAddCarrySubCarryCombine(N, DCI);
15081 case ISD::FADD:
15082 return performFAddCombine(N, DCI);
15083 case ISD::FSUB:
15084 return performFSubCombine(N, DCI);
15085 case ISD::FDIV:
15086 return performFDivCombine(N, DCI);
15087 case ISD::FMUL:
15088 return performFMulCombine(N, DCI);
15089 case ISD::SETCC:
15090 return performSetCCCombine(N, DCI);
15091 case ISD::FMAXNUM:
15092 case ISD::FMINNUM:
15093 case ISD::FMAXNUM_IEEE:
15094 case ISD::FMINNUM_IEEE:
15095 case ISD::FMAXIMUM:
15096 case ISD::FMINIMUM:
15097 case ISD::SMAX:
15098 case ISD::SMIN:
15099 case ISD::UMAX:
15100 case ISD::UMIN:
15101 case AMDGPUISD::FMIN_LEGACY:
15102 case AMDGPUISD::FMAX_LEGACY:
15103 return performMinMaxCombine(N, DCI);
15104 case ISD::FMA:
15105 return performFMACombine(N, DCI);
15106 case ISD::AND:
15107 return performAndCombine(N, DCI);
15108 case ISD::OR:
15109 return performOrCombine(N, DCI);
15110 case ISD::FSHR: {
15111 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15112 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15113 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15114 return matchPERM(N, DCI);
15115 }
15116 break;
15117 }
15118 case ISD::XOR:
15119 return performXorCombine(N, DCI);
15120 case ISD::ZERO_EXTEND:
15121 return performZeroExtendCombine(N, DCI);
15122 case ISD::SIGN_EXTEND_INREG:
15123 return performSignExtendInRegCombine(N, DCI);
15124 case AMDGPUISD::FP_CLASS:
15125 return performClassCombine(N, DCI);
15126 case ISD::FCANONICALIZE:
15127 return performFCanonicalizeCombine(N, DCI);
15128 case AMDGPUISD::RCP:
15129 return performRcpCombine(N, DCI);
15130 case ISD::FLDEXP:
15131 case AMDGPUISD::FRACT:
15132 case AMDGPUISD::RSQ:
15133 case AMDGPUISD::RCP_LEGACY:
15134 case AMDGPUISD::RCP_IFLAG:
15135 case AMDGPUISD::RSQ_CLAMP: {
15136 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15137 SDValue Src = N->getOperand(0);
15138 if (Src.isUndef())
15139 return Src;
15140 break;
15141 }
15142 case ISD::SINT_TO_FP:
15143 case ISD::UINT_TO_FP:
15144 return performUCharToFloatCombine(N, DCI);
15145 case ISD::FCOPYSIGN:
15146 return performFCopySignCombine(N, DCI);
15147 case AMDGPUISD::CVT_F32_UBYTE0:
15148 case AMDGPUISD::CVT_F32_UBYTE1:
15149 case AMDGPUISD::CVT_F32_UBYTE2:
15150 case AMDGPUISD::CVT_F32_UBYTE3:
15151 return performCvtF32UByteNCombine(N, DCI);
15152 case AMDGPUISD::FMED3:
15153 return performFMed3Combine(N, DCI);
15154 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15155 return performCvtPkRTZCombine(N, DCI);
15156 case AMDGPUISD::CLAMP:
15157 return performClampCombine(N, DCI);
15158 case ISD::SCALAR_TO_VECTOR: {
15159 SelectionDAG &DAG = DCI.DAG;
15160 EVT VT = N->getValueType(0);
15161
15162 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15163 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15164 SDLoc SL(N);
15165 SDValue Src = N->getOperand(0);
15166 EVT EltVT = Src.getValueType();
15167 if (EltVT != MVT::i16)
15168 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15169
15170 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15171 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15172 }
15173
15174 break;
15175 }
15176 case ISD::EXTRACT_VECTOR_ELT:
15177 return performExtractVectorEltCombine(N, DCI);
15178 case ISD::INSERT_VECTOR_ELT:
15179 return performInsertVectorEltCombine(N, DCI);
15180 case ISD::FP_ROUND:
15181 return performFPRoundCombine(N, DCI);
15182 case ISD::LOAD: {
15183 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15184 return Widened;
15185 [[fallthrough]];
15186 }
15187 default: {
15188 if (!DCI.isBeforeLegalize()) {
15189 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15190 return performMemSDNodeCombine(MemNode, DCI);
15191 }
15192
15193 break;
15194 }
15195 }
15196
15197 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15198}
15199
15200/// Helper function for adjustWritemask
15201static unsigned SubIdx2Lane(unsigned Idx) {
15202 switch (Idx) {
15203 default:
15204 return ~0u;
15205 case AMDGPU::sub0:
15206 return 0;
15207 case AMDGPU::sub1:
15208 return 1;
15209 case AMDGPU::sub2:
15210 return 2;
15211 case AMDGPU::sub3:
15212 return 3;
15213 case AMDGPU::sub4:
15214 return 4; // Possible with TFE/LWE
15215 }
15216}
15217
15218/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15219SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15220 SelectionDAG &DAG) const {
15221 unsigned Opcode = Node->getMachineOpcode();
15222
15223 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15224 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15225 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15226 return Node; // not implemented for D16
15227
15228 SDNode *Users[5] = {nullptr};
15229 unsigned Lane = 0;
15230 unsigned DmaskIdx =
15231 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15232 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15233 unsigned NewDmask = 0;
15234 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15235 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15236 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15237 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15238 ? true
15239 : false;
15240 unsigned TFCLane = 0;
15241 bool HasChain = Node->getNumValues() > 1;
15242
15243 if (OldDmask == 0) {
15244 // These are folded out, but on the chance it happens don't assert.
15245 return Node;
15246 }
15247
15248 unsigned OldBitsSet = llvm::popcount(OldDmask);
15249 // Work out which is the TFE/LWE lane if that is enabled.
15250 if (UsesTFC) {
15251 TFCLane = OldBitsSet;
15252 }
15253
15254 // Try to figure out the used register components
15255 for (SDUse &Use : Node->uses()) {
15256
15257 // Don't look at users of the chain.
15258 if (Use.getResNo() != 0)
15259 continue;
15260
15261 SDNode *User = Use.getUser();
15262
15263 // Abort if we can't understand the usage
15264 if (!User->isMachineOpcode() ||
15265 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15266 return Node;
15267
15268 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15269 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15270 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15271 // set, etc.
15272 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15273 if (Lane == ~0u)
15274 return Node;
15275
15276 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15277 if (UsesTFC && Lane == TFCLane) {
15278 Users[Lane] = User;
15279 } else {
15280 // Set which texture component corresponds to the lane.
15281 unsigned Comp;
15282 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15283 Comp = llvm::countr_zero(Dmask);
15284 Dmask &= ~(1 << Comp);
15285 }
15286
15287 // Abort if we have more than one user per component.
15288 if (Users[Lane])
15289 return Node;
15290
15291 Users[Lane] = User;
15292 NewDmask |= 1 << Comp;
15293 }
15294 }
15295
15296 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15297 bool NoChannels = !NewDmask;
15298 if (NoChannels) {
15299 if (!UsesTFC) {
15300 // No uses of the result and not using TFC. Then do nothing.
15301 return Node;
15302 }
15303 // If the original dmask has one channel - then nothing to do
15304 if (OldBitsSet == 1)
15305 return Node;
15306 // Use an arbitrary dmask - required for the instruction to work
15307 NewDmask = 1;
15308 }
15309 // Abort if there's no change
15310 if (NewDmask == OldDmask)
15311 return Node;
15312
15313 unsigned BitsSet = llvm::popcount(NewDmask);
15314
15315 // Check for TFE or LWE - increase the number of channels by one to account
15316 // for the extra return value
15317 // This will need adjustment for D16 if this is also included in
15318 // adjustWriteMask (this function) but at present D16 are excluded.
15319 unsigned NewChannels = BitsSet + UsesTFC;
15320
15321 int NewOpcode =
15322 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15323 assert(NewOpcode != -1 &&
15324 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15325 "failed to find equivalent MIMG op");
15326
15327 // Adjust the writemask in the node
15328 SmallVector<SDValue, 12> Ops;
15329 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15330 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15331 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15332
15333 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15334
15335 MVT ResultVT = NewChannels == 1
15336 ? SVT
15337 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
15338 : NewChannels == 5 ? 8
15339 : NewChannels);
15340 SDVTList NewVTList =
15341 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15342
15343 MachineSDNode *NewNode =
15344 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15345
15346 if (HasChain) {
15347 // Update chain.
15348 DAG.setNodeMemRefs(NewNode, Node->memoperands());
15349 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15350 }
15351
15352 if (NewChannels == 1) {
15353 assert(Node->hasNUsesOfValue(1, 0));
15354 SDNode *Copy =
15355 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15356 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15357 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15358 return nullptr;
15359 }
15360
15361 // Update the users of the node with the new indices
15362 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15363 SDNode *User = Users[i];
15364 if (!User) {
15365 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15366 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15367 if (i || !NoChannels)
15368 continue;
15369 } else {
15370 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15371 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15372 if (NewUser != User) {
15373 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15374 DAG.RemoveDeadNode(User);
15375 }
15376 }
15377
15378 switch (Idx) {
15379 default:
15380 break;
15381 case AMDGPU::sub0:
15382 Idx = AMDGPU::sub1;
15383 break;
15384 case AMDGPU::sub1:
15385 Idx = AMDGPU::sub2;
15386 break;
15387 case AMDGPU::sub2:
15388 Idx = AMDGPU::sub3;
15389 break;
15390 case AMDGPU::sub3:
15391 Idx = AMDGPU::sub4;
15392 break;
15393 }
15394 }
15395
15396 DAG.RemoveDeadNode(Node);
15397 return nullptr;
15398}
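// A rough example of the rewrite above (values are illustrative): an image
// sample with dmask = 0b1111 whose result is only read through EXTRACT_SUBREG
// of sub0 (x) and sub3 (w) gets NewDmask = 0b1001 and a two-channel variant of
// the opcode; the two extracts are then retargeted to sub0 and sub1 of the
// narrower result.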
15399
15400 static bool isFrameIndexOp(SDValue Op) {
15401 if (Op.getOpcode() == ISD::AssertZext)
15402 Op = Op.getOperand(0);
15403
15404 return isa<FrameIndexSDNode>(Op);
15405}
15406
15407/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15408/// with frame index operands.
15409 /// LLVM assumes that inputs to these instructions are registers.
15410SDNode *
15411 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15412 SelectionDAG &DAG) const {
15413 if (Node->getOpcode() == ISD::CopyToReg) {
15414 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15415 SDValue SrcVal = Node->getOperand(2);
15416
15417 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15418 // to try understanding copies to physical registers.
15419 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15420 SDLoc SL(Node);
15421 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15422 SDValue VReg = DAG.getRegister(
15423 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15424
15425 SDNode *Glued = Node->getGluedNode();
15426 SDValue ToVReg = DAG.getCopyToReg(
15427 Node->getOperand(0), SL, VReg, SrcVal,
15428 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15429 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15430 VReg, ToVReg.getValue(1));
15431 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15432 DAG.RemoveDeadNode(Node);
15433 return ToResultReg.getNode();
15434 }
15435 }
15436
15437 SmallVector<SDValue, 8> Ops;
15438 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15439 if (!isFrameIndexOp(Node->getOperand(i))) {
15440 Ops.push_back(Node->getOperand(i));
15441 continue;
15442 }
15443
15444 SDLoc DL(Node);
15445 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15446 Node->getOperand(i).getValueType(),
15447 Node->getOperand(i)),
15448 0));
15449 }
15450
15451 return DAG.UpdateNodeOperands(Node, Ops);
15452}
15453
15454/// Fold the instructions after selecting them.
15455/// Returns null if users were already updated.
15456 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15457 SelectionDAG &DAG) const {
15458 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15459 unsigned Opcode = Node->getMachineOpcode();
15460
15461 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15462 !TII->isGather4(Opcode) &&
15463 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15464 return adjustWritemask(Node, DAG);
15465 }
15466
15467 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15468 legalizeTargetIndependentNode(Node, DAG);
15469 return Node;
15470 }
15471
15472 switch (Opcode) {
15473 case AMDGPU::V_DIV_SCALE_F32_e64:
15474 case AMDGPU::V_DIV_SCALE_F64_e64: {
15475 // Satisfy the operand register constraint when one of the inputs is
15476 // undefined. Ordinarily each undef value will have its own implicit_def of
15477 // a vreg, so force these to use a single register.
15478 SDValue Src0 = Node->getOperand(1);
15479 SDValue Src1 = Node->getOperand(3);
15480 SDValue Src2 = Node->getOperand(5);
15481
15482 if ((Src0.isMachineOpcode() &&
15483 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15484 (Src0 == Src1 || Src0 == Src2))
15485 break;
15486
15487 MVT VT = Src0.getValueType().getSimpleVT();
15488 const TargetRegisterClass *RC =
15489 getRegClassFor(VT, Src0.getNode()->isDivergent());
15490
15491 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15492 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15493
15494 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15495 Src0, SDValue());
15496
15497 // src0 must be the same register as src1 or src2, even if the value is
15498 // undefined, so make sure we don't violate this constraint.
15499 if (Src0.isMachineOpcode() &&
15500 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15501 if (Src1.isMachineOpcode() &&
15502 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15503 Src0 = Src1;
15504 else if (Src2.isMachineOpcode() &&
15505 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15506 Src0 = Src2;
15507 else {
15508 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15509 Src0 = UndefReg;
15510 Src1 = UndefReg;
15511 }
15512 } else
15513 break;
15514
15515 SmallVector<SDValue, 9> Ops(Node->ops());
15516 Ops[1] = Src0;
15517 Ops[3] = Src1;
15518 Ops[5] = Src2;
15519 Ops.push_back(ImpDef.getValue(1));
15520 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15521 }
15522 default:
15523 break;
15524 }
15525
15526 return Node;
15527}
15528
15529// Any MIMG instructions that use tfe or lwe require an initialization of the
15530// result register that will be written in the case of a memory access failure.
15531// The required code is also added to tie this init code to the result of the
15532// img instruction.
15533 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15534 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15535 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15536 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15537 MachineBasicBlock &MBB = *MI.getParent();
15538
15539 int DstIdx =
15540 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15541 unsigned InitIdx = 0;
15542
15543 if (TII->isImage(MI)) {
15544 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15545 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15546 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15547
15548 if (!TFE && !LWE) // intersect_ray
15549 return;
15550
15551 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15552 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15553 unsigned D16Val = D16 ? D16->getImm() : 0;
15554
15555 if (!TFEVal && !LWEVal)
15556 return;
15557
15558 // At least one of TFE or LWE are non-zero
15559 // We have to insert a suitable initialization of the result value and
15560 // tie this to the dest of the image instruction.
15561
15562 // Calculate which dword we have to initialize to 0.
15563 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15564
15565 // check that dmask operand is found.
15566 assert(MO_Dmask && "Expected dmask operand in instruction");
15567
15568 unsigned dmask = MO_Dmask->getImm();
15569 // Determine the number of active lanes taking into account the
15570 // Gather4 special case
15571 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15572
15573 bool Packed = !Subtarget->hasUnpackedD16VMem();
15574
15575 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15576
15577 // Abandon attempt if the dst size isn't large enough
15578 // - this is in fact an error but this is picked up elsewhere and
15579 // reported correctly.
15580 uint32_t DstSize =
15581 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15582 if (DstSize < InitIdx)
15583 return;
15584 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15585 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15586 } else {
15587 return;
15588 }
15589
15590 const DebugLoc &DL = MI.getDebugLoc();
15591
15592 // Create a register for the initialization value.
15593 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15594 unsigned NewDst = 0; // Final initialized value will be in here
15595
15596 // If PRTStrictNull feature is enabled (the default) then initialize
15597 // all the result registers to 0, otherwise just the error indication
15598 // register (VGPRn+1)
15599 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15600 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15601
15602 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15603 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15604 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15605 // Initialize dword
15606 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15607 // clang-format off
15608 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15609 .addImm(0);
15610 // clang-format on
15611 // Insert into the super-reg
15612 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15613 .addReg(PrevDst)
15614 .addReg(SubReg)
15615 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15616
15617 PrevDst = NewDst;
15618 }
15619
15620 // Add as an implicit operand
15621 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15622
15623 // Tie the just added implicit operand to the dst
15624 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15625}
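// Illustrative example: a TFE buffer/image load with dmask = 0b0011 produces
// two data dwords plus one error dword. With PRTStrictNull (the default) all
// three dwords of the destination tuple are zero-initialized by the loop above
// and tied to vdata; otherwise only the final error dword is initialized.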
15626
15627/// Assign the register class depending on the number of
15628/// bits set in the writemask
15629 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15630 SDNode *Node) const {
15631 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15632
15633 MachineFunction *MF = MI.getParent()->getParent();
15634 MachineRegisterInfo &MRI = MF->getRegInfo();
15635 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15636
15637 if (TII->isVOP3(MI.getOpcode())) {
15638 // Make sure constant bus requirements are respected.
15639 TII->legalizeOperandsVOP3(MRI, MI);
15640
15641 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15642 // This saves a chain-copy of registers and better balance register
15643 // use between vgpr and agpr as agpr tuples tend to be big.
15644 if (!MI.getDesc().operands().empty()) {
15645 unsigned Opc = MI.getOpcode();
15646 bool HasAGPRs = Info->mayNeedAGPRs();
15647 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15648 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15649 for (auto I :
15650 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15651 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15652 if (I == -1)
15653 break;
15654 if ((I == Src2Idx) && (HasAGPRs))
15655 break;
15656 MachineOperand &Op = MI.getOperand(I);
15657 if (!Op.isReg() || !Op.getReg().isVirtual())
15658 continue;
15659 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15660 if (!TRI->hasAGPRs(RC))
15661 continue;
15662 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15663 if (!Src || !Src->isCopy() ||
15664 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15665 continue;
15666 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15667 // All uses of agpr64 and agpr32 can also accept vgpr except for
15668 // v_accvgpr_read, but we do not produce agpr reads during selection,
15669 // so no use checks are needed.
15670 MRI.setRegClass(Op.getReg(), NewRC);
15671 }
15672
15673 if (TII->isMAI(MI)) {
15674 // The ordinary src0, src1, src2 were legalized above.
15675 //
15676 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15677 // as a separate instruction.
15678 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15679 AMDGPU::OpName::scale_src0);
15680 if (Src0Idx != -1) {
15681 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15682 AMDGPU::OpName::scale_src1);
15683 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15684 TII->usesConstantBus(MRI, MI, Src1Idx))
15685 TII->legalizeOpWithMove(MI, Src1Idx);
15686 }
15687 }
15688
15689 if (!HasAGPRs)
15690 return;
15691
15692 // Resolve the rest of AV operands to AGPRs.
15693 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15694 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15695 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15696 if (TRI->isVectorSuperClass(RC)) {
15697 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15698 MRI.setRegClass(Src2->getReg(), NewRC);
15699 if (Src2->isTied())
15700 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15701 }
15702 }
15703 }
15704 }
15705
15706 return;
15707 }
15708
15709 if (TII->isImage(MI))
15710 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15711}
15712
15713 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15714 uint64_t Val) {
15715 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15716 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15717}
15718
15719 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15720 const SDLoc &DL,
15721 SDValue Ptr) const {
15722 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15723
15724 // Build the half of the subregister with the constants before building the
15725 // full 128-bit register. If we are building multiple resource descriptors,
15726 // this will allow CSEing of the 2-component register.
15727 const SDValue Ops0[] = {
15728 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15729 buildSMovImm32(DAG, DL, 0),
15730 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15731 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15732 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15733
15734 SDValue SubRegHi = SDValue(
15735 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15736
15737 // Combine the constants and the pointer.
15738 const SDValue Ops1[] = {
15739 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15740 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15741 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15742
15743 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15744}
15745
15746/// Return a resource descriptor with the 'Add TID' bit enabled
15747/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15748/// of the resource descriptor) to create an offset, which is added to
15749/// the resource pointer.
15750 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15751 SDValue Ptr, uint32_t RsrcDword1,
15752 uint64_t RsrcDword2And3) const {
15753 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15754 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15755 if (RsrcDword1) {
15756 PtrHi =
15757 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15758 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15759 0);
15760 }
15761
15762 SDValue DataLo =
15763 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15764 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15765
15766 const SDValue Ops[] = {
15767 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15768 PtrLo,
15769 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15770 PtrHi,
15771 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15772 DataLo,
15773 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15774 DataHi,
15775 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15776
15777 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15778}
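// Sketch of the resulting 128-bit descriptor (layout as described above, not
// restated from hardware docs): dwords 0-1 hold the 64-bit pointer, with
// RsrcDword1 OR'd into the high pointer dword, and dwords 2-3 hold
// RsrcDword2And3. With the 'Add TID' bit set, the stride in bits [61:48] is
// multiplied by the thread ID and added to the base address.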
15779
15780//===----------------------------------------------------------------------===//
15781// SI Inline Assembly Support
15782//===----------------------------------------------------------------------===//
15783
15784std::pair<unsigned, const TargetRegisterClass *>
15785 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15786 StringRef Constraint,
15787 MVT VT) const {
15788 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15789
15790 const TargetRegisterClass *RC = nullptr;
15791 if (Constraint.size() == 1) {
15792 const unsigned BitWidth = VT.getSizeInBits();
15793 switch (Constraint[0]) {
15794 default:
15795 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15796 case 's':
15797 case 'r':
15798 switch (BitWidth) {
15799 case 16:
15800 RC = &AMDGPU::SReg_32RegClass;
15801 break;
15802 case 64:
15803 RC = &AMDGPU::SGPR_64RegClass;
15804 break;
15805 default:
15806 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15807 if (!RC)
15808 return std::pair(0U, nullptr);
15809 break;
15810 }
15811 break;
15812 case 'v':
15813 switch (BitWidth) {
15814 case 16:
15815 RC = &AMDGPU::VGPR_32RegClass;
15816 break;
15817 default:
15818 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15819 if (!RC)
15820 return std::pair(0U, nullptr);
15821 break;
15822 }
15823 break;
15824 case 'a':
15825 if (!Subtarget->hasMAIInsts())
15826 break;
15827 switch (BitWidth) {
15828 case 16:
15829 RC = &AMDGPU::AGPR_32RegClass;
15830 break;
15831 default:
15832 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15833 if (!RC)
15834 return std::pair(0U, nullptr);
15835 break;
15836 }
15837 break;
15838 }
15839 // We actually support i128, i16 and f16 as inline parameters
15840 // even if they are not reported as legal
15841 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15842 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15843 return std::pair(0U, RC);
15844 }
15845
15846 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15847 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15848 if (RegName.consume_front("v")) {
15849 RC = &AMDGPU::VGPR_32RegClass;
15850 } else if (RegName.consume_front("s")) {
15851 RC = &AMDGPU::SGPR_32RegClass;
15852 } else if (RegName.consume_front("a")) {
15853 RC = &AMDGPU::AGPR_32RegClass;
15854 }
15855
15856 if (RC) {
15857 uint32_t Idx;
15858 if (RegName.consume_front("[")) {
15859 uint32_t End;
15860 bool Failed = RegName.consumeInteger(10, Idx);
15861 Failed |= !RegName.consume_front(":");
15862 Failed |= RegName.consumeInteger(10, End);
15863 Failed |= !RegName.consume_back("]");
15864 if (!Failed) {
15865 uint32_t Width = (End - Idx + 1) * 32;
15866 // Prohibit constraints for register ranges with a width that does not
15867 // match the required type.
15868 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15869 return std::pair(0U, nullptr);
15870 MCRegister Reg = RC->getRegister(Idx);
15871 if (SIRegisterInfo::isVGPRClass(RC))
15872 RC = TRI->getVGPRClassForBitWidth(Width);
15873 else if (SIRegisterInfo::isSGPRClass(RC))
15874 RC = TRI->getSGPRClassForBitWidth(Width);
15875 else if (SIRegisterInfo::isAGPRClass(RC))
15876 RC = TRI->getAGPRClassForBitWidth(Width);
15877 if (RC) {
15878 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15879 return std::pair(Reg, RC);
15880 }
15881 }
15882 } else {
15883 // Check for lossy scalar/vector conversions.
15884 if (VT.isVector() && VT.getSizeInBits() != 32)
15885 return std::pair(0U, nullptr);
15886 bool Failed = RegName.getAsInteger(10, Idx);
15887 if (!Failed && Idx < RC->getNumRegs())
15888 return std::pair(RC->getRegister(Idx), RC);
15889 }
15890 }
15891 }
15892
15893 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15894 if (Ret.first)
15895 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15896
15897 return Ret;
15898}
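// Worked example (illustrative): the constraint "{v[8:9]}" with a 64-bit type
// parses Idx = 8 and End = 9, giving Width = 64; the class becomes the 64-bit
// VGPR class and the returned register is the super-register covering v8-v9.
// A bare "{v8}" with a 32-bit type simply returns v8 in VGPR_32.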
15899
15900static bool isImmConstraint(StringRef Constraint) {
15901 if (Constraint.size() == 1) {
15902 switch (Constraint[0]) {
15903 default:
15904 break;
15905 case 'I':
15906 case 'J':
15907 case 'A':
15908 case 'B':
15909 case 'C':
15910 return true;
15911 }
15912 } else if (Constraint == "DA" || Constraint == "DB") {
15913 return true;
15914 }
15915 return false;
15916}
15917
15918 SITargetLowering::ConstraintType
15919 SITargetLowering::getConstraintType(StringRef Constraint) const {
15920 if (Constraint.size() == 1) {
15921 switch (Constraint[0]) {
15922 default:
15923 break;
15924 case 's':
15925 case 'v':
15926 case 'a':
15927 return C_RegisterClass;
15928 }
15929 }
15930 if (isImmConstraint(Constraint)) {
15931 return C_Other;
15932 }
15933 return TargetLowering::getConstraintType(Constraint);
15934}
15935
15936static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15938 Val = Val & maskTrailingOnes<uint64_t>(Size);
15939 }
15940 return Val;
15941}
15942
15944 StringRef Constraint,
15945 std::vector<SDValue> &Ops,
15946 SelectionDAG &DAG) const {
15947 if (isImmConstraint(Constraint)) {
15948 uint64_t Val;
15949 if (getAsmOperandConstVal(Op, Val) &&
15950 checkAsmConstraintVal(Op, Constraint, Val)) {
15951 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15952 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15953 }
15954 } else {
15955 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15956 }
15957}
15958
15959 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15960 unsigned Size = Op.getScalarValueSizeInBits();
15961 if (Size > 64)
15962 return false;
15963
15964 if (Size == 16 && !Subtarget->has16BitInsts())
15965 return false;
15966
15967 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15968 Val = C->getSExtValue();
15969 return true;
15970 }
15971 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15972 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15973 return true;
15974 }
15975 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15976 if (Size != 16 || Op.getNumOperands() != 2)
15977 return false;
15978 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15979 return false;
15980 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15981 Val = C->getSExtValue();
15982 return true;
15983 }
15984 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15985 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15986 return true;
15987 }
15988 }
15989
15990 return false;
15991}
15992
15993 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15994 uint64_t Val) const {
15995 if (Constraint.size() == 1) {
15996 switch (Constraint[0]) {
15997 case 'I':
15998 return AMDGPU::isInlinableIntLiteral(Val);
15999 case 'J':
16000 return isInt<16>(Val);
16001 case 'A':
16002 return checkAsmConstraintValA(Op, Val);
16003 case 'B':
16004 return isInt<32>(Val);
16005 case 'C':
16006 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16007 AMDGPU::isInlinableIntLiteral(Val);
16008 default:
16009 break;
16010 }
16011 } else if (Constraint.size() == 2) {
16012 if (Constraint == "DA") {
16013 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16014 int64_t LoBits = static_cast<int32_t>(Val);
16015 return checkAsmConstraintValA(Op, HiBits, 32) &&
16016 checkAsmConstraintValA(Op, LoBits, 32);
16017 }
16018 if (Constraint == "DB") {
16019 return true;
16020 }
16021 }
16022 llvm_unreachable("Invalid asm constraint");
16023}
16024
16025 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16026 unsigned MaxSize) const {
16027 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16028 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16029 if (Size == 16) {
16030 MVT VT = Op.getSimpleValueType();
16031 switch (VT.SimpleTy) {
16032 default:
16033 return false;
16034 case MVT::i16:
16035 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16036 case MVT::f16:
16037 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16038 case MVT::bf16:
16039 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16040 case MVT::v2i16:
16041 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16042 case MVT::v2f16:
16043 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16044 case MVT::v2bf16:
16045 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16046 }
16047 }
16048 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16049 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16050 return true;
16051 return false;
16052}
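// Worked example (illustrative): for an f16 operand, Val = 0x3C00 (1.0) is an
// inline constant, so an 'A' constraint is satisfied, while an arbitrary
// 16-bit pattern such as 0x1234 would have to be encoded as a literal and is
// rejected.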
16053
16054static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16055 switch (UnalignedClassID) {
16056 case AMDGPU::VReg_64RegClassID:
16057 return AMDGPU::VReg_64_Align2RegClassID;
16058 case AMDGPU::VReg_96RegClassID:
16059 return AMDGPU::VReg_96_Align2RegClassID;
16060 case AMDGPU::VReg_128RegClassID:
16061 return AMDGPU::VReg_128_Align2RegClassID;
16062 case AMDGPU::VReg_160RegClassID:
16063 return AMDGPU::VReg_160_Align2RegClassID;
16064 case AMDGPU::VReg_192RegClassID:
16065 return AMDGPU::VReg_192_Align2RegClassID;
16066 case AMDGPU::VReg_224RegClassID:
16067 return AMDGPU::VReg_224_Align2RegClassID;
16068 case AMDGPU::VReg_256RegClassID:
16069 return AMDGPU::VReg_256_Align2RegClassID;
16070 case AMDGPU::VReg_288RegClassID:
16071 return AMDGPU::VReg_288_Align2RegClassID;
16072 case AMDGPU::VReg_320RegClassID:
16073 return AMDGPU::VReg_320_Align2RegClassID;
16074 case AMDGPU::VReg_352RegClassID:
16075 return AMDGPU::VReg_352_Align2RegClassID;
16076 case AMDGPU::VReg_384RegClassID:
16077 return AMDGPU::VReg_384_Align2RegClassID;
16078 case AMDGPU::VReg_512RegClassID:
16079 return AMDGPU::VReg_512_Align2RegClassID;
16080 case AMDGPU::VReg_1024RegClassID:
16081 return AMDGPU::VReg_1024_Align2RegClassID;
16082 case AMDGPU::AReg_64RegClassID:
16083 return AMDGPU::AReg_64_Align2RegClassID;
16084 case AMDGPU::AReg_96RegClassID:
16085 return AMDGPU::AReg_96_Align2RegClassID;
16086 case AMDGPU::AReg_128RegClassID:
16087 return AMDGPU::AReg_128_Align2RegClassID;
16088 case AMDGPU::AReg_160RegClassID:
16089 return AMDGPU::AReg_160_Align2RegClassID;
16090 case AMDGPU::AReg_192RegClassID:
16091 return AMDGPU::AReg_192_Align2RegClassID;
16092 case AMDGPU::AReg_256RegClassID:
16093 return AMDGPU::AReg_256_Align2RegClassID;
16094 case AMDGPU::AReg_512RegClassID:
16095 return AMDGPU::AReg_512_Align2RegClassID;
16096 case AMDGPU::AReg_1024RegClassID:
16097 return AMDGPU::AReg_1024_Align2RegClassID;
16098 default:
16099 return -1;
16100 }
16101}
16102
16103// Figure out which registers should be reserved for stack access. Only after
16104// the function is legalized do we know all of the non-spill stack objects or if
16105// calls are present.
16106 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16107 MachineRegisterInfo &MRI = MF.getRegInfo();
16108 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16109 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16110 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16111 const SIInstrInfo *TII = ST.getInstrInfo();
16112
16113 if (Info->isEntryFunction()) {
16114 // Callable functions have fixed registers used for stack access.
16115 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16116 }
16117
16118 // TODO: Move this logic to getReservedRegs()
16119 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16120 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16121 Register SReg = ST.isWave32()
16122 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16123 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16124 &AMDGPU::SGPR_64RegClass);
16125 Info->setSGPRForEXECCopy(SReg);
16126
16127 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16128 Info->getStackPtrOffsetReg()));
16129 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16130 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16131
16132 // We need to worry about replacing the default register with itself in case
16133 // of MIR testcases missing the MFI.
16134 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16135 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16136
16137 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16138 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16139
16140 Info->limitOccupancy(MF);
16141
16142 if (ST.isWave32() && !MF.empty()) {
16143 for (auto &MBB : MF) {
16144 for (auto &MI : MBB) {
16145 TII->fixImplicitOperands(MI);
16146 }
16147 }
16148 }
16149
16150 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16151 // classes if required. Ideally the register class constraints would differ
16152 // per-subtarget, but there's no easy way to achieve that right now. This is
16153 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16154 // from using them as the register class for legal types.
16155 if (ST.needsAlignedVGPRs()) {
16156 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16157 const Register Reg = Register::index2VirtReg(I);
16158 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16159 if (!RC)
16160 continue;
16161 int NewClassID = getAlignedAGPRClassID(RC->getID());
16162 if (NewClassID != -1)
16163 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16164 }
16165 }
16166
16167 TargetLoweringBase::finalizeLowering(MF);
16168}
16169
16170 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16171 KnownBits &Known,
16172 const APInt &DemandedElts,
16173 const SelectionDAG &DAG,
16174 unsigned Depth) const {
16175 Known.resetAll();
16176 unsigned Opc = Op.getOpcode();
16177 switch (Opc) {
16178 case ISD::INTRINSIC_WO_CHAIN: {
16179 unsigned IID = Op.getConstantOperandVal(0);
16180 switch (IID) {
16181 case Intrinsic::amdgcn_mbcnt_lo:
16182 case Intrinsic::amdgcn_mbcnt_hi: {
16183 const GCNSubtarget &ST =
16184 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16185 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16186 // most 31 + src1.
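// Illustrative example: on a wave32 target, mbcnt_lo with a zero src1 is at
// most 31, so bits 5 and up of the result start out known zero; the known
// bits of src1 are then folded in by the addition below.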
16187 Known.Zero.setBitsFrom(
16188 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16189 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16190 Known = KnownBits::add(Known, Known2);
16191 return;
16192 }
16193 }
16194 break;
16195 }
16196 }
16197 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16198 Op, Known, DemandedElts, DAG, Depth);
16199}
16200
16201 void SITargetLowering::computeKnownBitsForFrameIndex(
16202 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16203 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16204
16205 // Set the high bits to zero based on the maximum allowed scratch size per
16206 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16207 // calculation won't overflow, so assume the sign bit is never set.
16208 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16209}
16210
16211 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16212 KnownBits &Known, unsigned Dim) {
16213 unsigned MaxValue =
16214 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16215 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16216}
16217
16218 void SITargetLowering::computeKnownBitsForTargetInstr(
16219 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16220 const MachineRegisterInfo &MRI, unsigned Depth) const {
16221 const MachineInstr *MI = MRI.getVRegDef(R);
16222 switch (MI->getOpcode()) {
16223 case AMDGPU::G_INTRINSIC:
16224 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16225 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16226 switch (IID) {
16227 case Intrinsic::amdgcn_workitem_id_x:
16228 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16229 break;
16230 case Intrinsic::amdgcn_workitem_id_y:
16231 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16232 break;
16233 case Intrinsic::amdgcn_workitem_id_z:
16234 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16235 break;
16236 case Intrinsic::amdgcn_mbcnt_lo:
16237 case Intrinsic::amdgcn_mbcnt_hi: {
16238 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16239 // most 31 + src1.
16240 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16241 ? getSubtarget()->getWavefrontSizeLog2()
16242 : 5);
16243 KnownBits Known2;
16244 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16245 Depth + 1);
16246 Known = KnownBits::add(Known, Known2);
16247 break;
16248 }
16249 case Intrinsic::amdgcn_groupstaticsize: {
16250 // We can report everything over the maximum size as 0. We can't report
16251 // based on the actual size because we don't know if it's accurate or not
16252 // at any given point.
16253 Known.Zero.setHighBits(
16254 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16255 break;
16256 }
16257 }
16258 break;
16259 }
16260 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16261 Known.Zero.setHighBits(24);
16262 break;
16263 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16264 Known.Zero.setHighBits(16);
16265 break;
16266 case AMDGPU::G_AMDGPU_SMED3:
16267 case AMDGPU::G_AMDGPU_UMED3: {
16268 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16269
16270 KnownBits Known2;
16271 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16272 if (Known2.isUnknown())
16273 break;
16274
16275 KnownBits Known1;
16276 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16277 if (Known1.isUnknown())
16278 break;
16279
16280 KnownBits Known0;
16281 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16282 if (Known0.isUnknown())
16283 break;
16284
16285 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16286 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16287 Known.One = Known0.One & Known1.One & Known2.One;
16288 break;
16289 }
16290 }
16291}
16292
16293 Align SITargetLowering::computeKnownAlignForTargetInstr(
16294 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16295 unsigned Depth) const {
16296 const MachineInstr *MI = MRI.getVRegDef(R);
16297 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16298 // FIXME: Can this move to generic code? What about the case where the call
16299 // site specifies a lower alignment?
16300 Intrinsic::ID IID = GI->getIntrinsicID();
16301 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16302 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16303 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16304 return *RetAlign;
16305 }
16306 return Align(1);
16307}
16308
16309 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16310 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16311 const Align CacheLineAlign = Align(64);
16312
16313 // Pre-GFX10 targets did not benefit from loop alignment.
16314 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16315 getSubtarget()->hasInstFwdPrefetchBug())
16316 return PrefAlign;
16317
16318 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
16319 // By default the prefetcher keeps one cache line behind and reads two ahead.
16320 // We can modify it with S_INST_PREFETCH for larger loops to keep two lines
16321 // behind and one ahead.
16322 // Therefore we can benefit from aligning loop headers if the loop fits in
16323 // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
16324 // cache lines and does not need alignment.
16325 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
16326 // prefetch settings; if it is at most 192 bytes we need two lines behind.
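// Worked example (sizes are illustrative): a 100-byte inner loop returns the
// 64-byte CacheLineAlign without touching the prefetch mode, while a 160-byte
// loop additionally gets S_INST_PREFETCH inserted in its preheader and exit
// to switch to two-lines-behind prefetching.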
16327
16328 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16329 const MachineBasicBlock *Header = ML->getHeader();
16330 if (Header->getAlignment() != PrefAlign)
16331 return Header->getAlignment(); // Already processed.
16332
16333 unsigned LoopSize = 0;
16334 for (const MachineBasicBlock *MBB : ML->blocks()) {
16335 // If an inner loop block is aligned, assume on average half of the alignment
16336 // size is added as nops.
16337 if (MBB != Header)
16338 LoopSize += MBB->getAlignment().value() / 2;
16339
16340 for (const MachineInstr &MI : *MBB) {
16341 LoopSize += TII->getInstSizeInBytes(MI);
16342 if (LoopSize > 192)
16343 return PrefAlign;
16344 }
16345 }
16346
16347 if (LoopSize <= 64)
16348 return PrefAlign;
16349
16350 if (LoopSize <= 128)
16351 return CacheLineAlign;
16352
16353 // If any of parent loops is surrounded by prefetch instructions do not
16354 // insert new for inner loop, which would reset parent's settings.
16355 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16356 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16357 auto I = Exit->getFirstNonDebugInstr();
16358 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16359 return CacheLineAlign;
16360 }
16361 }
16362
16363 MachineBasicBlock *Pre = ML->getLoopPreheader();
16364 MachineBasicBlock *Exit = ML->getExitBlock();
16365
16366 if (Pre && Exit) {
16367 auto PreTerm = Pre->getFirstTerminator();
16368 if (PreTerm == Pre->begin() ||
16369 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16370 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16371 .addImm(1); // prefetch 2 lines behind PC
16372
16373 auto ExitHead = Exit->getFirstNonDebugInstr();
16374 if (ExitHead == Exit->end() ||
16375 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16376 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16377 .addImm(2); // prefetch 1 line behind PC
16378 }
16379
16380 return CacheLineAlign;
16381}
16382
16383 LLVM_ATTRIBUTE_UNUSED
16384 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16385 assert(N->getOpcode() == ISD::CopyFromReg);
16386 do {
16387 // Follow the chain until we find an INLINEASM node.
16388 N = N->getOperand(0).getNode();
16389 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16390 return true;
16391 } while (N->getOpcode() == ISD::CopyFromReg);
16392 return false;
16393}
16394
16395 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16396 FunctionLoweringInfo *FLI,
16397 UniformityInfo *UA) const {
16398 switch (N->getOpcode()) {
16399 case ISD::CopyFromReg: {
16400 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16401 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16402 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16403 Register Reg = R->getReg();
16404
16405 // FIXME: Why does this need to consider isLiveIn?
16406 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16407 return !TRI->isSGPRReg(MRI, Reg);
16408
16409 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16410 return UA->isDivergent(V);
16411
16412 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16413 return !TRI->isSGPRReg(MRI, Reg);
16414 }
16415 case ISD::LOAD: {
16416 const LoadSDNode *L = cast<LoadSDNode>(N);
16417 unsigned AS = L->getAddressSpace();
16418 // A flat load may access private memory.
16419 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16420 }
16421 case ISD::CALLSEQ_END:
16422 return true;
16423 case ISD::INTRINSIC_WO_CHAIN:
16424 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16425 case ISD::INTRINSIC_W_CHAIN:
16426 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16427 case AMDGPUISD::ATOMIC_CMP_SWAP:
16428 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
16429 case AMDGPUISD::BUFFER_ATOMIC_ADD:
16430 case AMDGPUISD::BUFFER_ATOMIC_SUB:
16431 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
16432 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
16433 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
16434 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
16435 case AMDGPUISD::BUFFER_ATOMIC_AND:
16436 case AMDGPUISD::BUFFER_ATOMIC_OR:
16437 case AMDGPUISD::BUFFER_ATOMIC_XOR:
16438 case AMDGPUISD::BUFFER_ATOMIC_INC:
16439 case AMDGPUISD::BUFFER_ATOMIC_DEC:
16440 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
16441 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
16442 case AMDGPUISD::BUFFER_ATOMIC_FADD:
16443 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
16444 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
16445 // Target-specific read-modify-write atomics are sources of divergence.
16446 return true;
16447 default:
16448 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16449 // Generic read-modify-write atomics are sources of divergence.
16450 return A->readMem() && A->writeMem();
16451 }
16452 return false;
16453 }
16454}
16455
16456 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16457 EVT VT) const {
16458 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16459 case MVT::f32:
16460 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16461 case MVT::f64:
16462 case MVT::f16:
16463 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16464 default:
16465 return false;
16466 }
16467}
16468
16469 bool SITargetLowering::denormalsEnabledForType(
16470 LLT Ty, const MachineFunction &MF) const {
16471 switch (Ty.getScalarSizeInBits()) {
16472 case 32:
16473 return !denormalModeIsFlushAllF32(MF);
16474 case 64:
16475 case 16:
16476 return !denormalModeIsFlushAllF64F16(MF);
16477 default:
16478 return false;
16479 }
16480}
16481
16482 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16483 const SelectionDAG &DAG,
16484 bool SNaN,
16485 unsigned Depth) const {
16486 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16487 const MachineFunction &MF = DAG.getMachineFunction();
16488 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16489
16490 if (Info->getMode().DX10Clamp)
16491 return true; // Clamped to 0.
16492 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16493 }
16494
16495 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16496 Depth);
16497}
16498
16499// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16500// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16501 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16502 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16503 return true;
16504
16505 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16506 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16507 if (DenormMode == DenormalMode::getPreserveSign())
16508 return true;
16509
16510 // TODO: Remove this.
16511 return RMW->getFunction()
16512 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16513 .getValueAsBool();
16514}
16515
16516 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16517 LLVMContext &Ctx = RMW->getContext();
16518 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16519 StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16520
16521 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16522 << "Hardware instruction generated for atomic "
16523 << RMW->getOperationName(RMW->getOperation())
16524 << " operation at memory scope " << MemScope;
16525}
16526
16527static bool isV2F16OrV2BF16(Type *Ty) {
16528 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16529 Type *EltTy = VT->getElementType();
16530 return VT->getNumElements() == 2 &&
16531 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16532 }
16533
16534 return false;
16535}
16536
16537static bool isV2F16(Type *Ty) {
16538 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16539 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16540}
16541
16542static bool isV2BF16(Type *Ty) {
16543 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16544 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16545}
16546
16547/// \return true if atomicrmw integer ops work for the type.
16548static bool isAtomicRMWLegalIntTy(Type *Ty) {
16549 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16550 unsigned BW = IT->getBitWidth();
16551 return BW == 32 || BW == 64;
16552 }
16553
16554 return false;
16555}
16556
16557/// \return true if this atomicrmw xchg type can be selected.
16558static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16559 Type *Ty = RMW->getType();
16560 if (isAtomicRMWLegalIntTy(Ty))
16561 return true;
16562
16563 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16564 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16565 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16566 return BW == 32 || BW == 64;
16567 }
16568
16569 if (Ty->isFloatTy() || Ty->isDoubleTy())
16570 return true;
16571
16572 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16573 return VT->getNumElements() == 2 &&
16574 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16575 }
16576
16577 return false;
16578}
16579
16580/// \returns true if it's valid to emit a native instruction for \p RMW, based
16581/// on the properties of the target memory.
16582static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16583 const AtomicRMWInst *RMW,
16584 bool HasSystemScope) {
16585 // The remote/fine-grained access logic is different from the integer
16586 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16587 // fine-grained access does not work, even for a device local allocation.
16588 //
16589 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16590 // allocations work.
16591 if (HasSystemScope) {
16592 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16593 RMW->hasMetadata("amdgpu.no.remote.memory"))
16594 return true;
16595 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16596 return true;
16597
16598 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16599}
16600
16601/// \return Action to perform on AtomicRMWInsts for integer operations.
16602 static TargetLowering::AtomicExpansionKind
16603 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16604 return isAtomicRMWLegalIntTy(RMW->getType())
16605 ? TargetLowering::AtomicExpansionKind::None
16606 : TargetLowering::AtomicExpansionKind::CmpXChg;
16607 }
16608
16609/// Return if a flat address space atomicrmw can access private memory.
16610 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16611 const MDNode *NoaliasAddrSpaceMD =
16612 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16613 if (!NoaliasAddrSpaceMD)
16614 return true;
16615
16616 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16617 ++I) {
16618 auto *Low = mdconst::extract<ConstantInt>(
16619 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16620 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16621 auto *High = mdconst::extract<ConstantInt>(
16622 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16623 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16624 }
16625 }
16626
16627 return true;
16628}
16629
16630 TargetLowering::AtomicExpansionKind
16631 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16632 unsigned AS = RMW->getPointerAddressSpace();
16633 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16634 return AtomicExpansionKind::NotAtomic;
16635
16636 // 64-bit flat atomics that dynamically reside in private memory will silently
16637 // be dropped.
16638 //
16639 // Note that we will emit a new copy of the original atomic in the expansion,
16640 // which will be incrementally relegalized.
16641 const DataLayout &DL = RMW->getFunction()->getDataLayout();
16642 if (AS == AMDGPUAS::FLAT_ADDRESS &&
16643 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16644 flatInstrMayAccessPrivate(RMW))
16645 return AtomicExpansionKind::Expand;
16646
16647 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16648 OptimizationRemarkEmitter ORE(RMW->getFunction());
16649 ORE.emit([=]() {
16650 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16651 });
16652 return Kind;
16653 };
16654
16655 auto SSID = RMW->getSyncScopeID();
16656 bool HasSystemScope =
16657 SSID == SyncScope::System ||
16658 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16659
16660 auto Op = RMW->getOperation();
16661 switch (Op) {
16662 case AtomicRMWInst::Xchg: {
16663 // PCIe supports add and xchg for system atomics.
16664 return isAtomicRMWLegalXChgTy(RMW)
16665 ? TargetLowering::AtomicExpansionKind::None
16666 : TargetLowering::AtomicExpansionKind::CmpXChg;
16667 }
16668 case AtomicRMWInst::Add:
16669 case AtomicRMWInst::And:
16670 case AtomicRMWInst::UIncWrap:
16671 case AtomicRMWInst::UDecWrap:
16672 return atomicSupportedIfLegalIntType(RMW);
16673 case AtomicRMWInst::Sub:
16674 case AtomicRMWInst::Or:
16675 case AtomicRMWInst::Xor: {
16676 // Atomic sub/or/xor do not work over PCI express, but atomic add
16677 // does. InstCombine transforms these with 0 to or, so undo that.
16678 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16679 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16680 ConstVal && ConstVal->isNullValue())
16681 return AtomicExpansionKind::Expand;
16682 }
16683
16684 return atomicSupportedIfLegalIntType(RMW);
16685 }
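// [Editor's note: illustrative sketch, not part of the upstream source.] The
// zero-value special case above undoes an InstCombine canonicalization so that
// a system-scope identity update can still use the PCIe-supported add:
//   atomicrmw or ptr addrspace(1) %p, i32 0 seq_cst
// is expanded (see emitExpandAtomicRMW further down) back into
//   atomicrmw add ptr addrspace(1) %p, i32 0 seq_cst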
16686 case AtomicRMWInst::FAdd: {
16687 Type *Ty = RMW->getType();
16688
16689 // TODO: Handle REGION_ADDRESS
16690 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16691 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16692 // is fixed to round-to-nearest-even.
16693 //
16694 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16695 // round-to-nearest-even.
16696 //
16697 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16698 // suggests it is OK if the floating-point mode may not match the calling
16699 // thread.
16700 if (Ty->isFloatTy()) {
16701 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16702 : AtomicExpansionKind::CmpXChg;
16703 }
16704
16705 if (Ty->isDoubleTy()) {
16706 // Ignores denormal mode, but we don't consider flushing mandatory.
16707 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16708 : AtomicExpansionKind::CmpXChg;
16709 }
16710
16711 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16712 return AtomicExpansionKind::None;
16713
16714 return AtomicExpansionKind::CmpXChg;
16715 }
16716
16717 // LDS atomics respect the denormal mode from the mode register.
16718 //
16719 // Traditionally f32 global/buffer memory atomics would unconditionally
16720 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16721 // flush.
16722 //
16723 // On targets with flat atomic fadd, denormals would flush depending on
16724 // whether the target address resides in LDS or global memory. We consider
16725 // this flat-maybe-flush as will-flush.
16726 if (Ty->isFloatTy() &&
16727 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16728 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16729 return AtomicExpansionKind::CmpXChg;
16730
16731 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16732 // safe. The message phrasing also should be better.
16733 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16734 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16735 // gfx940, gfx12
16736 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16737 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16738 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16739 // gfx90a, gfx940, gfx12
16740 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16741 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16742
16743 // gfx940, gfx12
16744 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16745 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16746 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16747 // gfx90a, gfx940, gfx12
16748 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16749 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16750
16751 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16752 // buffer. gfx12 does have the buffer version.
16753 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16754 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16755 }
16756
16757 // global and flat atomic fadd f64: gfx90a, gfx940.
16758 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16759 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16760
16761 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16762 if (Ty->isFloatTy()) {
16763 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16764 // gfx11+.
16765 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16766 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16767 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16768 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16769 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16770 } else {
16771 // gfx908
16772 if (RMW->use_empty() &&
16773 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16774 isV2F16(Ty))
16775 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16776 }
16777 }
16778
16779 // flat atomic fadd f32: gfx940, gfx11+.
16780 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16781 if (Subtarget->hasFlatAtomicFaddF32Inst())
16782 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16783
16784 // If it is in the flat address space and the type is float, we will try to
16785 // expand it, provided the target supports both global and LDS atomic fadd.
16786 // Both are needed because the expansion emits a runtime address-space
16787 // check: if the address is in the global address space, we emit the global
16788 // atomic fadd; if it is in the shared address space, we emit the LDS
16789 // atomic fadd.
16790 if (Subtarget->hasLDSFPAtomicAddF32()) {
16791 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16792 return AtomicExpansionKind::Expand;
16793 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16794 return AtomicExpansionKind::Expand;
16795 }
16796 }
16797 }
16798
16799 return AtomicExpansionKind::CmpXChg;
16800 }
16801 case AtomicRMWInst::FMin:
16802 case AtomicRMWInst::FMax: {
16803 Type *Ty = RMW->getType();
16804
16805 // LDS float and double fmin/fmax were always supported.
16806 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16807 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16808 : AtomicExpansionKind::CmpXChg;
16809 }
16810
16811 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16812 // For flat and global cases:
16813 // float, double in gfx7. Manual claims denormal support.
16814 // Removed in gfx8.
16815 // float, double restored in gfx10.
16816 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16817 //
16818 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16819 // no f32.
16820 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16821 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16822 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16823 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16824 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16825 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16826 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16827 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16828 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16829 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16830 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16831 }
16832 }
16833
16834 return AtomicExpansionKind::CmpXChg;
16835 }
16836 case AtomicRMWInst::Min:
16837 case AtomicRMWInst::Max:
16838 case AtomicRMWInst::UMin:
16839 case AtomicRMWInst::UMax: {
16840 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16841 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16842 // Always expand system scope min/max atomics.
16843 if (HasSystemScope)
16844 return AtomicExpansionKind::CmpXChg;
16845 }
16846
16847 return atomicSupportedIfLegalIntType(RMW);
16848 }
16849 case AtomicRMWInst::Nand:
16850 case AtomicRMWInst::FSub:
16851 default:
16852 return AtomicExpansionKind::CmpXChg;
16853 }
16854
16855 llvm_unreachable("covered atomicrmw op switch");
16856}
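// [Editor's note: illustrative summary, not part of the upstream source.]
// Typical outcomes of the switch above:
//   atomicrmw add ptr addrspace(1) %p, i32 1      -> AtomicExpansionKind::None
//   atomicrmw umax ptr %p, i32 %v ("one-as")      -> CmpXChg (system scope)
//   atomicrmw fsub ptr addrspace(1) %p, float %v  -> CmpXChg (no native fsub)
//   any atomicrmw on a ptr addrspace(5) operand   -> NotAtomic (private memory
//                                                    is effectively thread-local)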
16857
16858 TargetLowering::AtomicExpansionKind
16859 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16860 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16861 ? AtomicExpansionKind::NotAtomic
16862 : AtomicExpansionKind::None;
16863}
16864
16865 TargetLowering::AtomicExpansionKind
16866 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16867 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16868 ? AtomicExpansionKind::NotAtomic
16869 : AtomicExpansionKind::None;
16870}
16871
16872 TargetLowering::AtomicExpansionKind
16873 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16874 unsigned AddrSpace = CmpX->getPointerAddressSpace();
16875 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16876 return AtomicExpansionKind::NotAtomic;
16877
16878 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16879 return AtomicExpansionKind::None;
16880
16881 const DataLayout &DL = CmpX->getDataLayout();
16882
16883 Type *ValTy = CmpX->getNewValOperand()->getType();
16884
16885 // If a 64-bit flat atomic may alias private, we need to avoid using the
16886 // atomic in the private case.
16887 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16888 : AtomicExpansionKind::None;
16889}
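// [Editor's note: illustrative sketch, not part of the upstream source.] A
// 64-bit flat cmpxchg whose pointer may alias private memory is expanded so
// the runtime is.private check can be inserted:
//   %r = cmpxchg ptr %p, i64 %old, i64 %new seq_cst seq_cst
// whereas the same operation on an addrspace(1) pointer is left as a single
// hardware atomic (AtomicExpansionKind::None).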
16890
16891const TargetRegisterClass *
16892SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16893 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16894 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16895 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16896 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16897 : &AMDGPU::SReg_32RegClass;
16898 if (!TRI->isSGPRClass(RC) && !isDivergent)
16899 return TRI->getEquivalentSGPRClass(RC);
16900 if (TRI->isSGPRClass(RC) && isDivergent)
16901 return TRI->getEquivalentVGPRClass(RC);
16902
16903 return RC;
16904}
16905
16906// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16907// uniform values (as produced by the mask results of control flow intrinsics)
16908// used outside of divergent blocks. The phi users need to also be treated as
16909// always uniform.
16910//
16911// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16912static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16913 unsigned WaveSize) {
16914 // FIXME: We assume we never cast the mask results of a control flow
16915 // intrinsic.
16916 // Early exit if the type won't be consistent as a compile time hack.
16917 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16918 if (!IT || IT->getBitWidth() != WaveSize)
16919 return false;
16920
16921 if (!isa<Instruction>(V))
16922 return false;
16923 if (!Visited.insert(V).second)
16924 return false;
16925 bool Result = false;
16926 for (const auto *U : V->users()) {
16927 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16928 if (V == U->getOperand(1)) {
16929 switch (Intrinsic->getIntrinsicID()) {
16930 default:
16931 Result = false;
16932 break;
16933 case Intrinsic::amdgcn_if_break:
16934 case Intrinsic::amdgcn_if:
16935 case Intrinsic::amdgcn_else:
16936 Result = true;
16937 break;
16938 }
16939 }
16940 if (V == U->getOperand(0)) {
16941 switch (Intrinsic->getIntrinsicID()) {
16942 default:
16943 Result = false;
16944 break;
16945 case Intrinsic::amdgcn_end_cf:
16946 case Intrinsic::amdgcn_loop:
16947 Result = true;
16948 break;
16949 }
16950 }
16951 } else {
16952 Result = hasCFUser(U, Visited, WaveSize);
16953 }
16954 if (Result)
16955 break;
16956 }
16957 return Result;
16958}
16959
16960 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16961 const Value *V) const {
16962 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16963 if (CI->isInlineAsm()) {
16964 // FIXME: This cannot give a correct answer. This should only trigger in
16965 // the case where inline asm returns mixed SGPR and VGPR results, used
16966 // outside the defining block. We don't have a specific result to
16967 // consider, so this assumes if any value is SGPR, the overall register
16968 // also needs to be SGPR.
16969 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16970 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16971 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16972 for (auto &TC : TargetConstraints) {
16973 if (TC.Type == InlineAsm::isOutput) {
16974 ComputeConstraintToUse(TC, SDValue());
16975 const TargetRegisterClass *RC =
16976 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
16977 TC.ConstraintVT)
16978 .second;
16979 if (RC && SIRI->isSGPRClass(RC))
16980 return true;
16981 }
16982 }
16983 }
16984 }
16985 SmallPtrSet<const Value *, 16> Visited;
16986 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16987}
16988
16989 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16990 for (SDUse &Use : N->uses()) {
16991 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
16992 if (getBasePtrIndex(M) == Use.getOperandNo())
16993 return true;
16994 }
16995 }
16996 return false;
16997}
16998
16999 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17000 SDValue N1) const {
17001 if (!N0.hasOneUse())
17002 return false;
17003 // Take care of the opportunity to keep N0 uniform
17004 if (N0->isDivergent() || !N1->isDivergent())
17005 return true;
17006 // Check if we have a good chance to form the memory access pattern with the
17007 // base and offset
17008 return (DAG.isBaseWithConstantOffset(N0) &&
17009 hasMemSDNodeUser(*N0->user_begin()));
17010}
17011
17012 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17013 Register N0, Register N1) const {
17014 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17015}
17016
17017 MachineMemOperand::Flags
17018 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17019 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17020 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17021 if (I.getMetadata("amdgpu.noclobber"))
17022 Flags |= MONoClobber;
17023 if (I.getMetadata("amdgpu.last.use"))
17024 Flags |= MOLastUse;
17025 return Flags;
17026}
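// [Editor's note: illustrative sketch, not part of the upstream source.] The
// metadata consumed above is produced by AMDGPUAnnotateUniformValues, e.g.:
//   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
// which makes the resulting machine memory operand carry the target-specific
// MONoClobber flag.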
17027
17028 bool SITargetLowering::checkForPhysRegDependency(
17029 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17030 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
17031 if (User->getOpcode() != ISD::CopyToReg)
17032 return false;
17033 if (!Def->isMachineOpcode())
17034 return false;
17035 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17036 if (!MDef)
17037 return false;
17038
17039 unsigned ResNo = User->getOperand(Op).getResNo();
17040 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17041 return false;
17042 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17043 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17044 PhysReg = AMDGPU::SCC;
17045 const TargetRegisterClass *RC =
17046 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17047 Cost = RC->getCopyCost();
17048 return true;
17049 }
17050 return false;
17051}
17052
17053/// Check if it is profitable to hoist instruction in then/else to if.
17054 bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
17055 if (!I->hasOneUse())
17056 return true;
17057
17058 Instruction *User = I->user_back();
17059 // TODO: Add more patterns that are not profitable to hoist and
17060 // handle modifiers such as fabs and fneg
17061 switch (I->getOpcode()) {
17062 case Instruction::FMul: {
17063 if (User->getOpcode() != Instruction::FSub &&
17064 User->getOpcode() != Instruction::FAdd)
17065 return true;
17066
17067 const TargetOptions &Options = getTargetMachine().Options;
17068
17069 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17070 Options.AllowFPOpFusion != FPOpFusion::Fast &&
17071 !Options.UnsafeFPMath) ||
17072 !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17073 }
17074 default:
17075 return true;
17076 }
17077 return true;
17078}
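// [Editor's note: illustrative sketch, not part of the upstream source.] The
// FMul case above keeps a multiply next to its single fadd/fsub user when the
// pair could be contracted into an FMA, e.g.:
//   %m = fmul contract float %a, %b
//   %r = fadd contract float %m, %c
// Hoisting %m out of the then/else block would lose that fusion opportunity,
// so isProfitableToHoist returns false for it.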
17079
17080 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17081 Instruction *AI) const {
17082 // Given: atomicrmw fadd ptr %addr, float %val ordering
17083 //
17084 // With this expansion we produce the following code:
17085 // [...]
17086 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17087 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17088 //
17089 // atomicrmw.shared:
17090 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17091 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17092 // float %val ordering
17093 // br label %atomicrmw.phi
17094 //
17095 // atomicrmw.check.private:
17096 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
17097 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17098 //
17099 // atomicrmw.private:
17100 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17101 // %loaded.private = load float, ptr addrspace(5) %cast.private
17102 // %val.new = fadd float %loaded.private, %val
17103 // store float %val.new, ptr addrspace(5) %cast.private
17104 // br label %atomicrmw.phi
17105 //
17106 // atomicrmw.global:
17107 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17108 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17109 // float %val ordering
17110 // br label %atomicrmw.phi
17111 //
17112 // atomicrmw.phi:
17113 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17114 // [ %loaded.private, %atomicrmw.private ],
17115 // [ %loaded.global, %atomicrmw.global ]
17116 // br label %atomicrmw.end
17117 //
17118 // atomicrmw.end:
17119 // [...]
17120 //
17121 //
17122 // For 64-bit atomics which may reside in private memory, we perform a simpler
17123 // version that only inserts the private check, and uses the flat operation.
17124
17125 IRBuilder<> Builder(AI);
17126 LLVMContext &Ctx = Builder.getContext();
17127
17128 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17129 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17130 : AtomicCmpXchgInst::getPointerOperandIndex();
17131 Value *Addr = AI->getOperand(PtrOpIdx);
17132
17133 /// TODO: Only need to check private, then emit flat-known-not private (no
17134 /// need for shared block, or cast to global).
17135 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17136
17137 Align Alignment;
17138 if (RMW)
17139 Alignment = RMW->getAlign();
17140 else if (CX)
17141 Alignment = CX->getAlign();
17142 else
17143 llvm_unreachable("unhandled atomic operation");
17144
17145 // FullFlatEmulation is true if we need to issue the private, shared, and
17146 // global cases.
17147 //
17148 // If this is false, we are only dealing with the flat-targeting-private case,
17149 // where we only insert a check for private and still use the flat instruction
17150 // for global and shared.
17151
17152 bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17153 Subtarget->hasAtomicFaddInsts() &&
17154 RMW->getType()->isFloatTy();
17155
17156 // If the return value isn't used, do not introduce a false use in the phi.
17157 bool ReturnValueIsUsed = !AI->use_empty();
17158
17159 BasicBlock *BB = Builder.GetInsertBlock();
17160 Function *F = BB->getParent();
17161 BasicBlock *ExitBB =
17162 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17163 BasicBlock *SharedBB = nullptr;
17164
17165 BasicBlock *CheckPrivateBB = BB;
17166 if (FullFlatEmulation) {
17167 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17168 CheckPrivateBB =
17169 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17170 }
17171
17172 BasicBlock *PrivateBB =
17173 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17174 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17175 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17176
17177 std::prev(BB->end())->eraseFromParent();
17178 Builder.SetInsertPoint(BB);
17179
17180 Value *LoadedShared = nullptr;
17181 if (FullFlatEmulation) {
17182 CallInst *IsShared = Builder.CreateIntrinsic(
17183 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17184 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17185 Builder.SetInsertPoint(SharedBB);
17186 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17187 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17188
17189 Instruction *Clone = AI->clone();
17190 Clone->insertInto(SharedBB, SharedBB->end());
17191 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17192 LoadedShared = Clone;
17193
17194 Builder.CreateBr(PhiBB);
17195 Builder.SetInsertPoint(CheckPrivateBB);
17196 }
17197
17198 CallInst *IsPrivate = Builder.CreateIntrinsic(
17199 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17200 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17201
17202 Builder.SetInsertPoint(PrivateBB);
17203
17204 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17205 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17206
17207 Value *LoadedPrivate;
17208 if (RMW) {
17209 LoadedPrivate = Builder.CreateAlignedLoad(
17210 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17211
17212 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17213 LoadedPrivate, RMW->getValOperand());
17214
17215 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17216 } else {
17217 auto [ResultLoad, Equal] =
17218 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17219 CX->getNewValOperand(), CX->getAlign());
17220
17221 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17222 ResultLoad, 0);
17223 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17224 }
17225
17226 Builder.CreateBr(PhiBB);
17227
17228 Builder.SetInsertPoint(GlobalBB);
17229
17230 // Continue using a flat instruction if we only emitted the check for private.
17231 Instruction *LoadedGlobal = AI;
17232 if (FullFlatEmulation) {
17233 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17234 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17235 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17236 }
17237
17238 AI->removeFromParent();
17239 AI->insertInto(GlobalBB, GlobalBB->end());
17240
17241 // The new atomicrmw may go through another round of legalization later.
17242 if (!FullFlatEmulation) {
17243 // We inserted the runtime check already, make sure we do not try to
17244 // re-expand this.
17245 // TODO: Should union with any existing metadata.
17246 MDBuilder MDB(F->getContext());
17247 MDNode *RangeNotPrivate =
17248 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17249 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17250 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17251 RangeNotPrivate);
17252 }
17253
17254 Builder.CreateBr(PhiBB);
17255
17256 Builder.SetInsertPoint(PhiBB);
17257
17258 if (ReturnValueIsUsed) {
17259 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17260 AI->replaceAllUsesWith(Loaded);
17261 if (FullFlatEmulation)
17262 Loaded->addIncoming(LoadedShared, SharedBB);
17263 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17264 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17265 Loaded->takeName(AI);
17266 }
17267
17268 Builder.CreateBr(ExitBB);
17269}
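// [Editor's note: clarifying remark, not part of the upstream source.] The
// full shared/private/global emulation above is only taken for a float flat
// atomicrmw fadd on subtargets reporting hasAtomicFaddInsts(); every other
// caller reaches this function just to guard a 64-bit flat atomic against the
// dynamically-private case, reusing the flat instruction for the rest.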
17270
17271 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17272 AtomicRMWInst::BinOp Op = AI->getOperation();
17273
17274 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17275 Op == AtomicRMWInst::Xor) {
17276 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17277 ConstVal && ConstVal->isNullValue()) {
17278 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17279 AI->setOperation(AtomicRMWInst::Add);
17280
17281 // We may still need the private-alias-flat handling below.
17282
17283 // TODO: Skip this for cases where we cannot access remote memory.
17284 }
17285 }
17286
17287 // The non-flat expansions should only perform the de-canonicalization of
17288 // identity values.
17289 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17290 return;
17291
17292 emitExpandAtomicAddrSpacePredicate(AI);
17293}
17294
17295 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17296 emitExpandAtomicAddrSpacePredicate(CI);
17297}
17298
17299LoadInst *
17300SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17301 IRBuilder<> Builder(AI);
17302 auto Order = AI->getOrdering();
17303
17304 // The optimization removes the store aspect of the atomicrmw. Therefore, the
17305 // cache must be flushed if the atomic ordering had release semantics. This is
17306 // not necessarily a fence; a release fence just happens to perform that flush.
17307 // Avoid replacing an atomicrmw that has release semantics.
17308 if (isReleaseOrStronger(Order))
17309 return nullptr;
17310
17311 LoadInst *LI = Builder.CreateAlignedLoad(
17312 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17313 LI->setAtomic(Order, AI->getSyncScopeID());
17314 LI->copyMetadata(*AI);
17315 LI->takeName(AI);
17316 AI->replaceAllUsesWith(LI);
17317 AI->eraseFromParent();
17318 return LI;
17319}
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
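As a rough illustration of how the IRBuilder calls listed above compose (CreateCondBr, CreateBr, SetInsertPoint, CreatePHI), here is a minimal sketch that lowers a select into a branch-and-PHI diamond. The helper name, block names, and overall shape are illustrative, not code from this file.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: materialize "Cond ? TrueV : FalseV" with explicit control flow.
static Value *emitSelectViaBranch(IRBuilder<> &B, Value *Cond,
                                  Value *TrueV, Value *FalseV) {
  BasicBlock *BB = B.GetInsertBlock();      // block currently being filled
  Function *F = BB->getParent();
  LLVMContext &Ctx = B.getContext();

  BasicBlock *TrueBB = BasicBlock::Create(Ctx, "sel.true", F);
  BasicBlock *JoinBB = BasicBlock::Create(Ctx, "sel.join", F);

  B.CreateCondBr(Cond, TrueBB, JoinBB);     // terminate the current block

  B.SetInsertPoint(TrueBB);
  B.CreateBr(JoinBB);                       // true path falls into the join

  B.SetInsertPoint(JoinBB);
  PHINode *Phi = B.CreatePHI(TrueV->getType(), 2, "sel");
  Phi->addIncoming(TrueV, TrueBB);
  Phi->addIncoming(FalseV, BB);             // false path came straight from BB
  return Phi;
}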
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
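A minimal sketch of the LLT helpers listed above; the bit widths, the address space, and the header path are illustrative (the header has moved between LLVM releases).

#include "llvm/CodeGenTypes/LowLevelType.h"  // older releases: llvm/CodeGen/LowLevelType.h
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);               // 32-bit scalar
  LLT P1 = LLT::pointer(1, 64);            // 64-bit pointer in address space 1
  (void)S32.isScalar();                    // true
  (void)S32.getScalarSizeInBits();         // 32
  (void)P1.getSizeInBits();                // TypeSize of 64 bits
  LLT V4S16 = LLT::fixed_vector(4, 16);    // <4 x s16>
  LLT V4S32 = V4S16.changeElementSize(32); // <4 x s32>, same element count
  (void)V4S32;
}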
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
Definition: Instructions.h:176
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:261
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
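A short, hedged sketch combining LLVMContext::getOrInsertSyncScopeID with LoadInst::setAtomic from the entries above; the "agent" scope string and the acquire ordering are arbitrary example choices, not taken from this file.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Sketch: turn an existing plain load into an acquire atomic load at a
// named synchronization scope.
static void makeLoadAtomicAtScope(LoadInst *LI) {
  LLVMContext &Ctx = LI->getContext();
  SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");  // example scope name
  LI->setAtomic(AtomicOrdering::Acquire, SSID);
}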
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
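A small sketch tying MDBuilder::createRange to Instruction::setMetadata (listed earlier); the [0, 1024) bound is an arbitrary example value.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

// Sketch: attach !range [0, 1024) metadata to a 32-bit load.
static void attachExampleRange(LoadInst *LI) {
  MDBuilder MDB(LI->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);
}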
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
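A quick sketch of the MVT helpers listed above; the concrete types are arbitrary examples, and the header path differs in older LLVM releases.

#include "llvm/CodeGenTypes/MachineValueType.h"  // older releases: llvm/Support/MachineValueType.h
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);  // v4i32
  (void)V4I32.getVectorNumElements();         // 4
  (void)V4I32.getScalarType();                // MVT::i32
  (void)V4I32.getSizeInBits();                // 128 bits
  (void)V4I32.getStoreSize();                 // 16 bytes
  (void)MVT::getIntegerVT(64);                // MVT::i64
}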
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
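The MachineBasicBlock helpers above (CreateMachineBasicBlock, splice, addSuccessor, transferSuccessorsAndUpdatePHIs) are typically combined in custom-inserter code that carves a loop out of an existing block. The following is a schematic of that general pattern, not the exact code in this file; the caller is expected to fill LoopBB with the loop body, branches, and backedge.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>
using namespace llvm;

// Schematic: split BB after MI into BB -> LoopBB -> RemainderBB.
static MachineBasicBlock *splitBlockAfter(MachineInstr &MI,
                                          MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  const BasicBlock *LLVMBB = BB->getBasicBlock();

  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(LLVMBB);
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(LLVMBB);
  MachineFunction::iterator It(BB);
  MF->insert(++It, LoopBB);                  // place LoopBB right after BB
  MF->insert(It, RemainderBB);               // and RemainderBB after LoopBB

  LoopBB->addSuccessor(LoopBB);              // backedge (schematic)
  LoopBB->addSuccessor(RemainderBB);         // loop exit

  // Move everything after MI into the remainder block and transfer BB's
  // original successors (updating PHIs) to it.
  RemainderBB->splice(RemainderBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(&MI)), BB->end());
  RemainderBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(LoopBB);
  return LoopBB;
}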
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
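A hedged sketch of the MachineInstrBuilder add* helpers above, using the target-independent COPY opcode so the snippet stays self-contained; immediate and basic-block operands would be appended the same way with addImm() and addMBB(). The helper and its arguments are illustrative.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

// Sketch: insert "DstReg = COPY SrcReg" in front of MI, reusing MI's DebugLoc.
static void emitCopyBefore(MachineBasicBlock &MBB, MachineInstr &MI,
                           const TargetInstrInfo *TII, Register DstReg,
                           Register SrcReg) {
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg);
}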
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
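A brief sketch showing how the MachineMemOperand flags above are typically or'd together when allocating an MMO via MachineFunction::getMachineMemOperand (listed earlier); the 32-bit memory type and 4-byte alignment are example values.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"  // header path varies by release
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Sketch: a 4-byte invariant, dereferenceable load MMO for PtrInfo.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));
}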
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
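A tiny sketch of the SDValue/SDNode accessors above in the style of a DAG-combine guard; the particular pattern (a single-use shift by a constant) is purely illustrative.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Sketch: is Op a single-use ISD::SHL whose shift amount is a constant
// smaller than the scalar bit width?
static bool isSingleUseShlByConstant(SDValue Op) {
  if (Op.getOpcode() != ISD::SHL || !Op.hasOneUse())
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return C->getZExtValue() < Op.getScalarValueSizeInBits();
  return false;
}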
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N, which may be an ISD::FADD, ISD::FSUB, or ISD::FMUL, can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
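For orientation, a hedged example of how generic code might query a couple of the hooks listed above; the address space, opcode, and type are arbitrary illustration values, not statements about this target's configuration.

#include "SIISelLowering.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

// Sketch: query two of the hooks above with example arguments.
static bool exampleHookQueries(const SITargetLowering &TLI,
                               const DataLayout &DL) {
  MVT PtrVT = TLI.getPointerTy(DL, /*AS=*/0);
  bool Desirable = TLI.isTypeDesirableForOp(ISD::SHL, MVT::i16);
  return PtrVT.isScalarInteger() && Desirable;
}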
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:750
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:982
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:801
const Pass * getPass() const
Definition: SelectionDAG.h:493
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:856
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:827
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:712
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:700
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:873
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
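As a hedged illustration of how the SelectionDAG construction helpers above compose inside custom lowering code, here is a small sketch that builds "X == 0 ? X + 1 : X"; the computation and the helper name are arbitrary examples, not code from this file.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Sketch: build "X == 0 ? X + 1 : X" for an integer-typed SDValue X.
static SDValue buildExample(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue IsZero = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETEQ);
  SDValue Plus1 = DAG.getNode(ISD::ADD, DL, VT, X, One);
  return DAG.getSelect(DL, VT, IsZero, Plus1, X);
}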
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
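StringSwitch is the usual way constraint strings and similar keys get classified; a hedged sketch follows, with a mapping that is purely illustrative rather than the table used in this file.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

enum class RegKind { SGPR, VGPR, AGPR, Unknown };

// Illustrative only: map a one-letter constraint-like string to a register kind.
static RegKind classifyExample(StringRef C) {
  return StringSwitch<RegKind>(C)
      .Case("s", RegKind::SGPR)
      .Case("v", RegKind::VGPR)
      .Case("a", RegKind::AGPR)
      .Default(RegKind::Unknown);
}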
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
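The protected setup calls above (setOperationAction, setTruncStoreAction, setTargetDAGCombine, setBooleanContents, setSchedulingPreference) are normally invoked from a target's TargetLowering constructor. The hypothetical target below only illustrates the call shapes; the specific types and actions are arbitrary and are not the configuration performed in this file.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical target lowering shown only to demonstrate the setup calls.
class ExampleTLI : public TargetLowering {
public:
  explicit ExampleTLI(const TargetMachine &TM) : TargetLowering(TM) {
    setBooleanContents(ZeroOrOneBooleanContent);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);   // no native f64 sin
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);   // native popcount
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);   // no i64->i16 trunc store
    setTargetDAGCombine(ISD::FADD);                    // request FADD combines
    setSchedulingPreference(Sched::RegPressure);
  }
};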
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:404
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:886
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1347
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1069
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to the "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1490
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1345
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1476
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:522
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1348
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1064
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Multiplication with overflow detection; the multiplication analogue of the [SU]ADDO/[SU]SUBO nodes (produces the result and an overflow flag).
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1168
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1350
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1165
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1349
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1639
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1606
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1586
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1609
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:55
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:246
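Usage sketch (illustrative only, not code from this page; the example function name is hypothetical):
  #include <cassert>
  #include "llvm/Support/MathExtras.h"
  // The largest value a 16-bit signed immediate can hold.
  void maxIntNExample() {
    assert(llvm::maxIntN(16) == 32767 && "max 16-bit signed value");
  }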
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
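Usage sketch (illustrative only, not code from this page; the example function name is hypothetical):
  #include <cassert>
  #include "llvm/ADT/bit.h"
  // Count set bits, e.g. the number of enabled lanes in a small mask.
  void popcountExample() {
    assert(llvm::popcount(0xF0u) == 4 && "four bits set in 0xF0");
  }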
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:40
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:557
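Usage sketch (illustrative only, not code from this page):
  #include "llvm/Support/MathExtras.h"
  // Round an offset down to an 8-byte boundary.
  static_assert(llvm::alignDown(10u, 8u) == 8u, "10 rounds down to 8");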
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:396
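Usage sketch (illustrative only, not code from this page; the example function name is hypothetical):
  #include <cassert>
  #include "llvm/Support/MathExtras.h"
  // Round a size up to the next power of two.
  void powerOf2CeilExample() {
    assert(llvm::PowerOf2Ceil(17) == 32 && "next power of two above 17 is 32");
  }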
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:287
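Usage sketch (illustrative only, not code from this page):
  #include "llvm/Support/MathExtras.h"
  // 0x0FF0 is a single contiguous run of ones; 0x0F0F has two separate runs.
  static_assert(llvm::isShiftedMask_64(0x0FF0ULL), "contiguous run of ones");
  static_assert(!llvm::isShiftedMask_64(0x0F0FULL), "two separate runs");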
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
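Usage sketch (illustrative only, not code from this page; the helper name is hypothetical):
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  // Range-based any_of: no explicit begin()/end().
  bool anyNegative(const llvm::SmallVectorImpl<int> &Vals) {
    return llvm::any_of(Vals, [](int V) { return V < 0; });
  }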
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
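Usage sketch (illustrative only, not code from this page; the example function name is hypothetical):
  #include <cassert>
  #include "llvm/Support/MathExtras.h"
  // Floor log base 2, e.g. for turning a power-of-two size into a shift amount.
  void log2Example() {
    assert(llvm::Log2_32(40) == 5 && "floor(log2(40)) == 5");
  }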
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:161
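Usage sketch (illustrative only, not code from this page): splitting a 64-bit constant into its two 32-bit halves.
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::Hi_32(0x123456789ABCDEF0ULL) == 0x12345678u, "high half");
  static_assert(llvm::Lo_32(0x123456789ABCDEF0ULL) == 0x9ABCDEF0u, "low half");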
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
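Usage sketch (illustrative only, not code from this page):
  #include "llvm/Support/MathExtras.h"
  // Number of 4-byte dwords needed to cover 10 bytes.
  static_assert(llvm::divideCeil(10u, 4u) == 3u, "ceil(10 / 4) == 3");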
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
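Usage sketch (illustrative only, not code from this page; the example function name is hypothetical):
  #include <cassert>
  #include "llvm/Support/Alignment.h"
  // Pad a byte size out to the next multiple of an Align.
  void alignToExample() {
    assert(llvm::alignTo(10, llvm::Align(8)) == 16 && "10 rounded up to a multiple of 8");
  }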
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:237
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
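Usage sketch (illustrative only, not code from this page; the helper name is hypothetical):
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/ArrayRef.h"
  // Membership test over a range without explicit iterators.
  bool isOneOf(llvm::ArrayRef<unsigned> Opcodes, unsigned Opc) {
    return llvm::is_contained(Opcodes, Opc);
  }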
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
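Usage sketch (illustrative only, not code from this page; the example function name is hypothetical):
  #include <cassert>
  #include "llvm/Support/Alignment.h"
  // The alignment still guaranteed 4 bytes past a 16-byte-aligned base.
  void commonAlignmentExample() {
    assert(llvm::commonAlignment(llvm::Align(16), 4).value() == 4);
  }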
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
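Usage sketch (illustrative only, not code from this page; the function name is hypothetical): building an extended type that has no simple MVT.
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  llvm::EVT makeV3I48(llvm::LLVMContext &Ctx) {
    llvm::EVT EltVT = llvm::EVT::getIntegerVT(Ctx, 48); // extended i48
    return llvm::EVT::getVectorVT(Ctx, EltVT, 3);       // v3i48
  }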
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
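Usage sketch (illustrative only, not code from this page; the helper name is hypothetical):
  #include "llvm/Support/KnownBits.h"
  // If both addends are known multiples of 16, so is their sum.
  llvm::KnownBits knownBitsOfSum() {
    llvm::KnownBits LHS(32), RHS(32);
    LHS.Zero.setLowBits(4); // LHS: low 4 bits known zero
    RHS.Zero.setLowBits(4); // RHS: low 4 bits known zero
    return llvm::KnownBits::add(LHS, RHS); // sum: low 4 bits known zero
  }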
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals