LLVM 23.0.0git
SIISelLowering.cpp
Go to the documentation of this file.
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
48#include "llvm/Support/ModRef.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
// Debug/tuning escape hatch: when set on the command line, the backend skips
// loop-block alignment and the associated loop prefetching. Defaults to off
// (i.e. loops ARE aligned/prefetched unless the flag is given).
59static cl::opt<bool>
 60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
 61 cl::desc("Do not align and prefetch loops"),
 62 cl::init(false));
63
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(false));
68
73
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
93 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
100 addRegisterClass(MVT::f32, V32RegClass);
101
102 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
106
107 addRegisterClass(MVT::f64, V64RegClass);
108 addRegisterClass(MVT::v2f32, V64RegClass);
109 addRegisterClass(MVT::Untyped, V64RegClass);
110
111 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
113
114 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
119
120 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
122
123 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
125
126 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
128
129 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
131
132 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
134
135 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
137
138 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
140
141 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(MVT::v10f32,
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32,
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32,
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
152
153 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(MVT::v16f32,
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
156
157 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
159
160 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(MVT::v16f64,
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
173 }
174
175 // Unless there are also VOP3P operations, not operations are really legal.
176 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
177 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
180 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(MVT::v32f32,
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
196
197 computeRegisterProperties(Subtarget->getRegisterInfo());
198
199 // The boolean content concept here is too inflexible. Compares only ever
200 // really produce a 1-bit result. Any copy/extend from these will turn into a
201 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
202 // it's what most targets use.
205
206 // We need to custom lower vector stores from local memory
208 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
212 Custom);
213
215 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
216 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
217 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
218 MVT::i1, MVT::v32i32},
219 Custom);
220
221 if (isTypeLegal(MVT::bf16)) {
222 for (unsigned Opc :
231 ISD::SETCC}) {
232 setOperationAction(Opc, MVT::bf16, Promote);
233 }
234
236
238 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
239
243
244 // We only need to custom lower because we can't specify an action for bf16
245 // sources.
248 }
249
250 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
251 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
252 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
253 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
254 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
255 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
256 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
257 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
260 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
261 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
262 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
263 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
265 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
266
267 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
268 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
269 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
271 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
272 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
273 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
274
275 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
276 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
277
281 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
282
283 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
284
286 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
287
289 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
290 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
291
293 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
296 Expand);
298 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
301 Expand);
302
304 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
306 Custom);
307
310 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
311
313
315
317 Expand);
318
319#if 0
321#endif
322
323 // We only support LOAD/STORE and vector manipulation ops for vectors
324 // with > 4 elements.
325 for (MVT VT :
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
334 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
335 switch (Op) {
336 case ISD::LOAD:
337 case ISD::STORE:
339 case ISD::BITCAST:
340 case ISD::UNDEF:
344 case ISD::IS_FPCLASS:
345 break;
350 break;
351 default:
353 break;
354 }
355 }
356 }
357
359
360 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
361 // is expanded to avoid having two separate loops in case the index is a VGPR.
362
363 // Most operations are naturally 32-bit vector operations. We only support
364 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
365 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
377 }
378
379 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
391 }
392
393 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
405 }
406
407 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
419 }
420
421 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
423 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
430
432 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
433 }
434
436 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
438 Custom);
439
440 if (Subtarget->hasPkMovB32()) {
441 // TODO: 16-bit element vectors should be legal with even aligned elements.
442 // TODO: Can be legal with wider source types than the result with
443 // subregister extracts.
444 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
445 }
446
448 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
449 // instead lower to cndmask in SITargetLowering::LowerSELECT().
451 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
452 // alignbit.
453 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
454
455 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
456 Custom);
457
458 // Avoid stack access for these.
459 // TODO: Generalize to more vector types.
461 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
463 Custom);
464
465 // Deal with vec3 vector operations when widened to vec4.
467 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
468
469 // Deal with vec5/6/7 vector operations when widened to vec8.
471 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
475 Custom);
476
477 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
478 // and output demarshalling
479 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
480
481 // We can't return success/failure, only the old value,
482 // let LLVM add the comparison
484 Expand);
485
486 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
487
488 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
489
490 // FIXME: This should be narrowed to i32, but that only happens if i64 is
491 // illegal.
492 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
493 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
494
495 // On SI this is s_memtime and s_memrealtime on VI.
497
498 if (Subtarget->hasSMemRealTime() ||
499 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
502
503 if (Subtarget->has16BitInsts()) {
506 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
509 } else {
511 }
512
513 if (Subtarget->hasMadMacF32Insts())
515
518
519 // We only really have 32-bit BFE instructions (and 16-bit on VI).
520 //
521 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
522 // effort to match them now. We want this to be false for i64 cases when the
523 // extraction isn't restricted to the upper or lower half. Ideally we would
524 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
525 // span the midpoint are probably relatively rare, so don't worry about them
526 // for now.
528
529 // Clamp modifier on add/sub
530 if (Subtarget->hasIntClamp())
532
533 if (Subtarget->hasAddNoCarryInsts())
534 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
535 Legal);
536
539 {MVT::f32, MVT::f64}, Custom);
540
541 // These are really only legal for ieee_mode functions. We should be avoiding
542 // them for functions that don't have ieee_mode enabled, so just say they are
543 // legal.
545 {MVT::f32, MVT::f64}, Legal);
546
547 if (Subtarget->haveRoundOpsF64())
549 Legal);
550 else
552 MVT::f64, Custom);
553
555 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
556 Legal);
557 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
558
561
562 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
564
566 Custom);
568 Custom);
570 Custom);
571
572 // Custom lower these because we can't specify a rule based on an illegal
573 // source bf16.
576
577 if (Subtarget->has16BitInsts()) {
580 MVT::i16, Legal);
581
582 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
583
585 MVT::i16, Expand);
586
590 ISD::CTPOP},
591 MVT::i16, Promote);
592
594
595 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
596
598 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
601
605
607
608 // F16 - Constant Actions.
611
612 // F16 - Load/Store Actions.
614 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
617
618 // BF16 - Load/Store Actions.
620 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
623
624 // F16 - VOP1 Actions.
627 MVT::f16, Custom);
628
629 // BF16 - VOP1 Actions.
630 if (Subtarget->hasBF16TransInsts())
632
635 MVT::f16, Promote);
638 MVT::bf16, Promote);
639
640 // F16 - VOP2 Actions.
641 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
642 Expand);
646
647 // F16 - VOP3 Actions.
649 if (STI.hasMadF16())
651
652 for (MVT VT :
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
656 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
657 switch (Op) {
658 case ISD::LOAD:
659 case ISD::STORE:
661 case ISD::BITCAST:
662 case ISD::UNDEF:
667 case ISD::IS_FPCLASS:
668 break;
671 case ISD::FSIN:
672 case ISD::FCOS:
674 break;
675 default:
677 break;
678 }
679 }
680 }
681
682 // v_perm_b32 can handle either of these.
683 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
685
686 // XXX - Do these do anything? Vector constants turn into build_vector.
687 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
688
689 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
690 Legal);
691
693 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
695 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
696
698 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
700 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
701
702 setOperationAction(ISD::AND, MVT::v2i16, Promote);
703 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
704 setOperationAction(ISD::OR, MVT::v2i16, Promote);
705 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
706 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
707 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
708
710 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
712 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
713 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
722
724 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
726 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
727 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
729
731 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
733 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
734
736 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
738 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
740 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
741
742 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
744 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
746 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
748
750 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
752 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
753 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
755
756 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
757 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
758 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
759 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
760 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
761 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
762
764 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
766 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
767 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
768 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
769
771 MVT::v2i32, Expand);
773
775 MVT::v4i32, Expand);
776
778 MVT::v8i32, Expand);
779
780 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
781 Subtarget->hasVOP3PInsts() ? Legal : Custom);
782
783 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
784 // This isn't really legal, but this avoids the legalizer unrolling it (and
785 // allows matching fneg (fabs x) patterns)
786 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
787
788 // Can do this in one BFI plus a constant materialize.
790 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
793 Custom);
794
797 MVT::f16, Custom);
799
802 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
803 Custom);
804
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
807 Expand);
808
809 for (MVT Vec16 :
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
814 Vec16, Custom);
816 }
817 }
818
819 if (Subtarget->hasVOP3PInsts()) {
823 MVT::v2i16, Legal);
824
827 MVT::v2f16, Legal);
828
830 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
831
833 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
836 Custom);
837
838 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
839 // Split vector operations.
844 VT, Custom);
845
846 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
847 // Split vector operations.
849 VT, Custom);
850
853 {MVT::v2f16, MVT::v4f16}, Custom);
854
855 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
856 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
857 Custom);
858
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
861 // Split vector operations.
863 VT, Custom);
864 }
865
866 if (Subtarget->hasPackedFP32Ops()) {
868 MVT::v2f32, Legal);
870 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
871 Custom);
872 }
873 }
874
876
877 if (Subtarget->has16BitInsts()) {
879 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
881 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
882 } else {
883 // Legalization hack.
884 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
885
887 }
888
890 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
894 Custom);
895
897
898 if (Subtarget->hasVectorMulU64())
900 else if (Subtarget->hasScalarSMulU64())
902
903 if (Subtarget->hasMad64_32())
905
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
908
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
911 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
912 } else {
913 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
914 if (Subtarget->hasMinimum3Maximum3F32())
916
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
919
920 // If only the vector form is available, we need to widen to a vector.
921 if (!Subtarget->hasMinimum3Maximum3F16())
923 }
924 }
925
926 if (Subtarget->hasVOP3PInsts()) {
927 // We want to break these into v2f16 pieces, not scalarize.
929 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
930 Custom);
931 }
932
933 if (Subtarget->hasIntMinMax64())
935 Legal);
936
938 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
940 MVT::i8},
941 Custom);
942
944 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
948 Custom);
949
951 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
955 Custom);
956
962
963 // TODO: Could move this to custom lowering, could benefit from combines on
964 // extract of relevant bits.
966
968
969 if (Subtarget->hasBF16ConversionInsts()) {
970 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
972 }
973
974 if (Subtarget->hasBF16PackedInsts()) {
977 MVT::v2bf16, Legal);
978 }
979
980 if (Subtarget->hasBF16TransInsts()) {
982 }
983
984 if (Subtarget->hasCvtPkF16F32Inst()) {
986 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
987 Custom);
988 }
989
993 ISD::SUB,
995 ISD::MUL,
996 ISD::FADD,
997 ISD::FSUB,
998 ISD::FDIV,
999 ISD::FMUL,
1008 ISD::FMA,
1009 ISD::SMIN,
1010 ISD::SMAX,
1011 ISD::UMIN,
1012 ISD::UMAX,
1013 ISD::SETCC,
1015 ISD::SMIN,
1016 ISD::SMAX,
1017 ISD::UMIN,
1018 ISD::UMAX,
1019 ISD::AND,
1020 ISD::OR,
1021 ISD::XOR,
1022 ISD::SHL,
1023 ISD::SRL,
1024 ISD::SRA,
1025 ISD::FSHR,
1036
1037 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1039
1040 // All memory operations. Some folding on the pointer operand is done to help
1041 // matching the constant offsets in the addressing modes.
1043 ISD::STORE,
1068
1069 // FIXME: In other contexts we pretend this is a per-function property.
1071
1073}
1074
1075const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1076
1078 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1079 return RCRegs;
1080}
1081
1082//===----------------------------------------------------------------------===//
1083// TargetLowering queries
1084//===----------------------------------------------------------------------===//
1085
1086// v_mad_mix* support a conversion from f16 to f32.
1087//
1088// There is only one special case when denormals are enabled we don't currently,
1089// where this is OK to use.
1090bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1091 EVT DestVT, EVT SrcVT) const {
1092 return DestVT.getScalarType() == MVT::f32 &&
1093 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1094 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1095 SrcVT.getScalarType() == MVT::f16) ||
1096 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1097 SrcVT.getScalarType() == MVT::bf16)) &&
1098 // TODO: This probably only requires no input flushing?
1100}
1101
1103 LLT DestTy, LLT SrcTy) const {
1104 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1105 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1106 DestTy.getScalarSizeInBits() == 32 &&
1107 SrcTy.getScalarSizeInBits() == 16 &&
1108 // TODO: This probably only requires no input flushing?
1109 denormalModeIsFlushAllF32(*MI.getMF());
1110}
1111
1113 // SI has some legal vector types, but no legal vector operations. Say no
1114 // shuffles are legal in order to prefer scalarizing some vector operations.
1115 return false;
1116}
1117
1119 CallingConv::ID CC,
1120 EVT VT) const {
1122 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1123
1124 if (VT.isVector()) {
1125 EVT ScalarVT = VT.getScalarType();
1126 unsigned Size = ScalarVT.getSizeInBits();
1127 if (Size == 16) {
1128 return Subtarget->has16BitInsts()
1129 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1130 : MVT::i32;
1131 }
1132
1133 if (Size < 16)
1134 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1135 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1136 }
1137
1138 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1139 return MVT::i32;
1140
1141 if (VT.getSizeInBits() > 32)
1142 return MVT::i32;
1143
1144 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1145}
1146
1148 CallingConv::ID CC,
1149 EVT VT) const {
1151 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1152
1153 if (VT.isVector()) {
1154 unsigned NumElts = VT.getVectorNumElements();
1155 EVT ScalarVT = VT.getScalarType();
1156 unsigned Size = ScalarVT.getSizeInBits();
1157
1158 // FIXME: Should probably promote 8-bit vectors to i16.
1159 if (Size == 16)
1160 return (NumElts + 1) / 2;
1161
1162 if (Size <= 32)
1163 return NumElts;
1164
1165 if (Size > 32)
1166 return NumElts * ((Size + 31) / 32);
1167 } else if (VT.getSizeInBits() > 32)
1168 return (VT.getSizeInBits() + 31) / 32;
1169
1170 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1171}
1172
1174 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1175 unsigned &NumIntermediates, MVT &RegisterVT) const {
1176 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1177 unsigned NumElts = VT.getVectorNumElements();
1178 EVT ScalarVT = VT.getScalarType();
1179 unsigned Size = ScalarVT.getSizeInBits();
1180 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1181 // support, but unless we can properly handle 3-vectors, it will be still be
1182 // inconsistent.
1183 if (Size == 16) {
1184 MVT SimpleIntermediateVT =
1186 IntermediateVT = SimpleIntermediateVT;
1187 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1188 NumIntermediates = (NumElts + 1) / 2;
1189 return (NumElts + 1) / 2;
1190 }
1191
1192 if (Size == 32) {
1193 RegisterVT = ScalarVT.getSimpleVT();
1194 IntermediateVT = RegisterVT;
1195 NumIntermediates = NumElts;
1196 return NumIntermediates;
1197 }
1198
1199 if (Size < 16 && Subtarget->has16BitInsts()) {
1200 // FIXME: Should probably form v2i16 pieces
1201 RegisterVT = MVT::i16;
1202 IntermediateVT = ScalarVT;
1203 NumIntermediates = NumElts;
1204 return NumIntermediates;
1205 }
1206
1207 if (Size != 16 && Size <= 32) {
1208 RegisterVT = MVT::i32;
1209 IntermediateVT = ScalarVT;
1210 NumIntermediates = NumElts;
1211 return NumIntermediates;
1212 }
1213
1214 if (Size > 32) {
1215 RegisterVT = MVT::i32;
1216 IntermediateVT = RegisterVT;
1217 NumIntermediates = NumElts * ((Size + 31) / 32);
1218 return NumIntermediates;
1219 }
1220 }
1221
1223 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1224}
1225
1227 const DataLayout &DL, Type *Ty,
1228 unsigned MaxNumLanes) {
1229 assert(MaxNumLanes != 0);
1230
1231 LLVMContext &Ctx = Ty->getContext();
1232 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1233 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1234 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1235 NumElts);
1236 }
1237
1238 return TLI.getValueType(DL, Ty);
1239}
1240
1241// Peek through TFE struct returns to only use the data size.
1243 const DataLayout &DL, Type *Ty,
1244 unsigned MaxNumLanes) {
1245 auto *ST = dyn_cast<StructType>(Ty);
1246 if (!ST)
1247 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1248
1249 // TFE intrinsics return an aggregate type.
1250 assert(ST->getNumContainedTypes() == 2 &&
1251 ST->getContainedType(1)->isIntegerTy(32));
1252 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1253}
1254
1255/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1256/// in-memory representation. This return value is a custom type because there
1257/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1258/// could cause issues during codegen, these address space 7 pointers will be
1259/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1260/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1261/// for cost modeling, to work. (This also sets us up decently for doing the
1262/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1264 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1265 return MVT::amdgpuBufferFatPointer;
1267 DL.getPointerSizeInBits(AS) == 192)
1268 return MVT::amdgpuBufferStridedPointer;
1270}
1271/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1272/// v8i32 when padding is added.
1273/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1274/// also v8i32 with padding.
1276 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1277 DL.getPointerSizeInBits(AS) == 160) ||
1279 DL.getPointerSizeInBits(AS) == 192))
1280 return MVT::v8i32;
1282}
1283
1284static unsigned getIntrMemWidth(unsigned IntrID) {
1285 switch (IntrID) {
1286 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1287 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1288 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1289 return 8;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1295 case Intrinsic::amdgcn_flat_load_monitor_b32:
1296 case Intrinsic::amdgcn_global_load_monitor_b32:
1297 return 32;
1298 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1300 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1301 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1302 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1303 case Intrinsic::amdgcn_flat_load_monitor_b64:
1304 case Intrinsic::amdgcn_global_load_monitor_b64:
1305 return 64;
1306 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1307 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1308 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1309 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1310 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1311 case Intrinsic::amdgcn_flat_load_monitor_b128:
1312 case Intrinsic::amdgcn_global_load_monitor_b128:
1313 return 128;
1314 default:
1315 llvm_unreachable("Unknown width");
1316 }
1317}
1318
// Decode an atomic-ordering intrinsic argument: the argument at ArgIdx is a
// ConstantInt holding a C-ABI atomic ordering value (AtomicOrderingCABI),
// which is mapped through the switch below.
// NOTE(review): the case labels and case bodies (and the first signature
// line) are missing from this rendering — only the `break`s and `default:`
// survive. Restore the elided lines from upstream before modifying this
// function; do not infer the mapping from what is visible here.
 1320 unsigned ArgIdx) {
 1321 Value *OrderingArg = CI.getArgOperand(ArgIdx);
 1322 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
 1323 switch (AtomicOrderingCABI(Ord)) {
 1326 break;
 1329 break;
 1332 break;
 1333 default:
 1335 }
 1336 }
1337
1338static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1339 MDNode *ScopeMD = cast<MDNode>(
1340 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1341 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1342 return CI.getContext().getOrInsertSyncScopeID(Scope);
1343}
1344
// Describe the memory accessed by a target memory intrinsic by appending one
// or more IntrinsicInfo entries to Infos. Intrinsics that both load and store
// (e.g. buffer/global -> LDS copies) push two entries, one per side of the
// transfer.
// NOTE(review): the leading signature line and a number of statement lines
// were elided in this rendering (visible as gaps in the original line
// numbering); restore them from upstream before editing any code here.
 1346 const CallBase &CI,
 1347 MachineFunction &MF,
 1348 unsigned IntrID) const {
// Propagate IR-level hints (invariant/nontemporal metadata, target flags)
// into the MachineMemOperand flags used for every entry below.
 1350 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
 1352 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
 1354 Flags |= getTargetMMOFlags(CI);
 1355
// Buffer/image (rsrc) intrinsics are table-driven via RsrcIntrinsic data.
 1356 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
 1358 AttributeSet Attr =
 1360 MemoryEffects ME = Attr.getMemoryEffects();
 1361 if (ME.doesNotAccessMemory())
 1362 return;
 1363
// The last argument of most rsrc intrinsics is a cache-policy immediate;
// s_buffer_prefetch_data has no such operand.
 1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
 1365 if (!IsSPrefetch) {
 1366 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
 1367 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
 1369 }
 1371
 1372 IntrinsicInfo Info;
 1373 // TODO: Should images get their own address space?
 1375
 1376 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
 1377 if (RsrcIntr->IsImage) {
 1378 const AMDGPU::ImageDimIntrinsicInfo *Intr =
 1380 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
 1381 Info.align.reset();
 1382 }
 1383
 1384 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
 1385 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
 1386 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
 1387 // We conservatively set the memory operand of a buffer intrinsic to the
 1388 // base resource pointer, so that we can access alias information about
 1389 // those pointers. Cases like "this points at the same value
 1390 // but with a different offset" are handled in
 1391 // areMemAccessesTriviallyDisjoint.
 1392 Info.ptrVal = RsrcArg;
 1393 }
 1394
// Pure loads: for images, the dmask bounds the number of lanes really read.
 1395 if (ME.onlyReadsMemory()) {
 1396 if (RsrcIntr->IsImage) {
 1397 unsigned MaxNumLanes = 4;
 1398
 1399 if (!BaseOpcode->Gather4) {
 1400 // If this isn't a gather, we may have excess loaded elements in the
 1401 // IR type. Check the dmask for the real number of elements loaded.
 1402 unsigned DMask =
 1403 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
 1404 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 1405 }
 1406
 1407 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
 1408 CI.getType(), MaxNumLanes);
 1409 } else {
 1410 Info.memVT =
 1412 std::numeric_limits<unsigned>::max());
 1413 }
 1414
 1415 // FIXME: What does alignment mean for an image?
 1416 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1417 Info.flags = Flags | MachineMemOperand::MOLoad;
// Pure stores: memVT comes from the data operand, again dmask-clamped for
// image intrinsics.
 1418 } else if (ME.onlyWritesMemory()) {
 1419 Info.opc = ISD::INTRINSIC_VOID;
 1420
 1421 Type *DataTy = CI.getArgOperand(0)->getType();
 1422 if (RsrcIntr->IsImage) {
 1423 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
 1424 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
 1425 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
 1426 DMaskLanes);
 1427 } else
 1428 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 1429
 1430 Info.flags = Flags | MachineMemOperand::MOStore;
 1431 } else {
 1432 // Atomic, NoReturn Sampler or prefetch
 1433 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
 1435
 1436 switch (IntrID) {
 1437 default:
 1438 Info.flags = Flags | MachineMemOperand::MOLoad;
 1439 if (!IsSPrefetch)
 1440 Info.flags |= MachineMemOperand::MOStore;
 1441
 1442 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
 1443 // Fake memory access type for no return sampler intrinsics
 1444 Info.memVT = MVT::i32;
 1445 } else {
 1446 // XXX - Should this be volatile without known ordering?
 1447 Info.flags |= MachineMemOperand::MOVolatile;
 1448 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
 1449 }
 1450 break;
 1451 case Intrinsic::amdgcn_raw_buffer_load_lds:
 1452 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
 1453 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
 1454 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
 1455 case Intrinsic::amdgcn_struct_buffer_load_lds:
 1456 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
 1457 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
 1458 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
 1459 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 1460
 1461 // Entry 0: Load from buffer.
 1462 // Don't set an offset, since the pointer value always represents the
 1463 // base of the buffer.
 1464 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
 1465 Info.flags = Flags | MachineMemOperand::MOLoad;
 1466 Infos.push_back(Info);
 1467
 1468 // Entry 1: Store to LDS.
 1469 // Instruction offset is applied, and an additional per-lane offset
 1470 // which we simulate using a larger memory type.
 1471 Info.memVT = EVT::getIntegerVT(
 1472 CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
 1473 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
 1474 Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
 1475 ->getZExtValue();
 1476 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
 1477 Info.flags = Flags | MachineMemOperand::MOStore;
 1478 Infos.push_back(Info);
 1479 return;
 1480 }
 1481 case Intrinsic::amdgcn_raw_atomic_buffer_load:
 1482 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
 1483 case Intrinsic::amdgcn_struct_atomic_buffer_load:
 1484 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
 1485 Info.memVT =
 1487 std::numeric_limits<unsigned>::max());
 1488 Info.flags = Flags | MachineMemOperand::MOLoad;
 1489 Infos.push_back(Info);
 1490 return;
 1491 }
 1492 }
 1493 }
 1494 Infos.push_back(Info);
 1495 return;
 1496 }
 1497
// Non-rsrc intrinsics are handled individually below.
 1498 IntrinsicInfo Info;
 1499 switch (IntrID) {
 1500 case Intrinsic::amdgcn_ds_ordered_add:
 1501 case Intrinsic::amdgcn_ds_ordered_swap: {
 1502 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1503 Info.memVT = MVT::getVT(CI.getType());
 1504 Info.ptrVal = CI.getOperand(0);
 1505 Info.align.reset();
 1507
 1508 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
 1509 if (!Vol->isZero())
 1510 Info.flags |= MachineMemOperand::MOVolatile;
 1511
 1512 Infos.push_back(Info);
 1513 return;
 1514 }
 1515 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
 1516 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
 1517 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1518 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
 1519 Info.ptrVal = nullptr;
 1520 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
 1522 Infos.push_back(Info);
 1523 return;
 1524 }
 1525 case Intrinsic::amdgcn_ds_append:
 1526 case Intrinsic::amdgcn_ds_consume: {
 1527 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1528 Info.memVT = MVT::getVT(CI.getType());
 1529 Info.ptrVal = CI.getOperand(0);
 1530 Info.align.reset();
 1532
 1533 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
 1534 if (!Vol->isZero())
 1535 Info.flags |= MachineMemOperand::MOVolatile;
 1536
 1537 Infos.push_back(Info);
 1538 return;
 1539 }
 1540 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
 1541 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
 1542 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
 1545 Info.memVT = MVT::getVT(CI.getType());
 1546 Info.ptrVal = CI.getOperand(0);
// The memVT assignment below intentionally overrides the one above.
 1547 Info.memVT = MVT::i64;
 1548 Info.size = 8;
 1549 Info.align.reset();
 1551 Infos.push_back(Info);
 1552 return;
 1553 }
 1554 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
 1555 case Intrinsic::amdgcn_image_bvh_intersect_ray:
 1556 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
 1557 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1558 Info.memVT =
 1559 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
 1560 ? CI.getType()
 1562 ->getElementType(0)); // XXX: what is correct VT?
 1563
 1564 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
 1565 Info.align.reset();
 1566 Info.flags = Flags | MachineMemOperand::MOLoad |
 1568 Infos.push_back(Info);
 1569 return;
 1570 }
 1571 case Intrinsic::amdgcn_global_atomic_fmin_num:
 1572 case Intrinsic::amdgcn_global_atomic_fmax_num:
 1573 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
 1574 case Intrinsic::amdgcn_flat_atomic_fmin_num:
 1575 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
 1576 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1577 Info.memVT = MVT::getVT(CI.getType());
 1578 Info.ptrVal = CI.getOperand(0);
 1579 Info.align.reset();
 1580 Info.flags =
 1583 Infos.push_back(Info);
 1584 return;
 1585 }
 1586 case Intrinsic::amdgcn_cluster_load_b32:
 1587 case Intrinsic::amdgcn_cluster_load_b64:
 1588 case Intrinsic::amdgcn_cluster_load_b128:
 1589 case Intrinsic::amdgcn_ds_load_tr6_b96:
 1590 case Intrinsic::amdgcn_ds_load_tr4_b64:
 1591 case Intrinsic::amdgcn_ds_load_tr8_b64:
 1592 case Intrinsic::amdgcn_ds_load_tr16_b128:
 1593 case Intrinsic::amdgcn_global_load_tr6_b96:
 1594 case Intrinsic::amdgcn_global_load_tr4_b64:
 1595 case Intrinsic::amdgcn_global_load_tr_b64:
 1596 case Intrinsic::amdgcn_global_load_tr_b128:
 1597 case Intrinsic::amdgcn_ds_read_tr4_b64:
 1598 case Intrinsic::amdgcn_ds_read_tr6_b96:
 1599 case Intrinsic::amdgcn_ds_read_tr8_b64:
 1600 case Intrinsic::amdgcn_ds_read_tr16_b64: {
 1601 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1602 Info.memVT = MVT::getVT(CI.getType());
 1603 Info.ptrVal = CI.getOperand(0);
 1604 Info.align.reset();
 1605 Info.flags = Flags | MachineMemOperand::MOLoad;
 1606 Infos.push_back(Info);
 1607 return;
 1608 }
 1609 case Intrinsic::amdgcn_flat_load_monitor_b32:
 1610 case Intrinsic::amdgcn_flat_load_monitor_b64:
 1611 case Intrinsic::amdgcn_flat_load_monitor_b128:
 1612 case Intrinsic::amdgcn_global_load_monitor_b32:
 1613 case Intrinsic::amdgcn_global_load_monitor_b64:
 1614 case Intrinsic::amdgcn_global_load_monitor_b128: {
 1615 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1616 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
 1617 Info.ptrVal = CI.getOperand(0);
 1618 Info.align.reset();
 1619 Info.flags = MachineMemOperand::MOLoad;
// Ordering and syncscope are carried as explicit intrinsic arguments.
 1620 Info.order = parseAtomicOrderingCABIArg(CI, 1);
 1621 Info.ssid = parseSyncscopeMDArg(CI, 2);
 1622 Infos.push_back(Info);
 1623 return;
 1624 }
 1625 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
 1626 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
 1627 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
 1628 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1629 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
 1630 Info.ptrVal = CI.getOperand(0);
 1631 Info.align.reset();
 1633 Info.order = parseAtomicOrderingCABIArg(CI, 1);
 1634 Info.ssid = parseSyncscopeMDArg(CI, 2);
 1635 Infos.push_back(Info);
 1636 return;
 1637 }
 1638 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
 1639 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
 1640 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
 1641 Info.opc = ISD::INTRINSIC_VOID;
 1642 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
 1643 Info.ptrVal = CI.getArgOperand(0);
 1644 Info.align.reset();
 1646 Info.order = parseAtomicOrderingCABIArg(CI, 2);
 1647 Info.ssid = parseSyncscopeMDArg(CI, 3);
 1648 Infos.push_back(Info);
 1649 return;
 1650 }
 1651 case Intrinsic::amdgcn_ds_gws_init:
 1652 case Intrinsic::amdgcn_ds_gws_barrier:
 1653 case Intrinsic::amdgcn_ds_gws_sema_v:
 1654 case Intrinsic::amdgcn_ds_gws_sema_br:
 1655 case Intrinsic::amdgcn_ds_gws_sema_p:
 1656 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
 1657 Info.opc = ISD::INTRINSIC_VOID;
 1658
 1659 const GCNTargetMachine &TM =
 1660 static_cast<const GCNTargetMachine &>(getTargetMachine());
 1661
 1663 Info.ptrVal = MFI->getGWSPSV(TM);
 1664
 1665 // This is an abstract access, but we need to specify a type and size.
 1666 Info.memVT = MVT::i32;
 1667 Info.size = 4;
 1668 Info.align = Align(4);
 1669
 1670 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
 1671 Info.flags = Flags | MachineMemOperand::MOLoad;
 1672 else
 1673 Info.flags = Flags | MachineMemOperand::MOStore;
 1674 Infos.push_back(Info);
 1675 return;
 1676 }
 1677 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
 1678 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
 1679 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
 1680 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
 1681 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
 1682 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
 1683 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
 1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
 1685 // Entry 0: Load from source (global/flat).
 1686 Info.opc = ISD::INTRINSIC_VOID;
 1687 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
 1688 Info.ptrVal = CI.getArgOperand(0); // Global pointer
 1689 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
 1690 Info.flags = Flags | MachineMemOperand::MOLoad;
 1691 Infos.push_back(Info);
 1692
 1693 // Entry 1: Store to LDS (same offset).
 1694 Info.flags = Flags | MachineMemOperand::MOStore;
 1695 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
 1696 Infos.push_back(Info);
 1697 return;
 1698 }
 1699 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
 1700 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
 1701 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
 1702 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
 1703 // Entry 0: Load from LDS.
 1704 Info.opc = ISD::INTRINSIC_VOID;
 1705 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
 1706 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
 1707 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
 1708 Info.flags = Flags | MachineMemOperand::MOLoad;
 1709 Infos.push_back(Info);
 1710
 1711 // Entry 1: Store to global (same offset).
 1712 Info.flags = Flags | MachineMemOperand::MOStore;
 1713 Info.ptrVal = CI.getArgOperand(0); // Global pointer
 1714 Infos.push_back(Info);
 1715 return;
 1716 }
 1717 case Intrinsic::amdgcn_load_to_lds:
 1718 case Intrinsic::amdgcn_load_async_to_lds:
 1719 case Intrinsic::amdgcn_global_load_lds:
 1720 case Intrinsic::amdgcn_global_load_async_lds: {
 1721 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
 1722 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
 1723 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
 1724 if (IsVolatile)
 1726
 1727 // Entry 0: Load from source (global/flat).
 1728 Info.opc = ISD::INTRINSIC_VOID;
 1729 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
 1730 Info.ptrVal = CI.getArgOperand(0); // Source pointer
 1731 Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
 1732 Info.flags = Flags | MachineMemOperand::MOLoad;
 1733 Infos.push_back(Info);
 1734
 1735 // Entry 1: Store to LDS.
 1736 // Same offset from the instruction, but an additional per-lane offset is
 1737 // added. Represent that using a wider memory type.
 1738 Info.memVT = EVT::getIntegerVT(CI.getContext(),
 1739 Width * 8 * Subtarget->getWavefrontSize());
 1740 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
 1741 Info.flags = Flags | MachineMemOperand::MOStore;
 1742 Infos.push_back(Info);
 1743 return;
 1744 }
 1745 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
 1746 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
 1747 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
 1748 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
 1749 Info.opc = ISD::INTRINSIC_W_CHAIN;
 1750
 1751 const GCNTargetMachine &TM =
 1752 static_cast<const GCNTargetMachine &>(getTargetMachine());
 1753
 1755 Info.ptrVal = MFI->getGWSPSV(TM);
 1756
 1757 // This is an abstract access, but we need to specify a type and size.
 1758 Info.memVT = MVT::i32;
 1759 Info.size = 4;
 1760 Info.align = Align(4);
 1761
 1763 Infos.push_back(Info);
 1764 return;
 1765 }
 1766 case Intrinsic::amdgcn_s_prefetch_data:
 1767 case Intrinsic::amdgcn_flat_prefetch:
 1768 case Intrinsic::amdgcn_global_prefetch: {
 1769 Info.opc = ISD::INTRINSIC_VOID;
 1770 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
 1771 Info.ptrVal = CI.getArgOperand(0);
 1772 Info.flags = Flags | MachineMemOperand::MOLoad;
 1773 Infos.push_back(Info);
 1774 return;
 1775 }
 1776 default:
// Unrecognized intrinsics contribute no memory info.
 1777 return;
 1778 }
 1779 }
1780
1782 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1784 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1785 // The DAG's ValueType loses the addrspaces.
1786 // Add them as 2 extra Constant operands "from" and "to".
1787 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1788 unsigned DstAS = I.getType()->getPointerAddressSpace();
1789 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1790 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1791 break;
1792 }
1793 default:
1794 break;
1795 }
1796}
1797
1800 Type *&AccessTy) const {
1801 Value *Ptr = nullptr;
1802 switch (II->getIntrinsicID()) {
1803 case Intrinsic::amdgcn_cluster_load_b128:
1804 case Intrinsic::amdgcn_cluster_load_b64:
1805 case Intrinsic::amdgcn_cluster_load_b32:
1806 case Intrinsic::amdgcn_ds_append:
1807 case Intrinsic::amdgcn_ds_consume:
1808 case Intrinsic::amdgcn_ds_load_tr8_b64:
1809 case Intrinsic::amdgcn_ds_load_tr16_b128:
1810 case Intrinsic::amdgcn_ds_load_tr4_b64:
1811 case Intrinsic::amdgcn_ds_load_tr6_b96:
1812 case Intrinsic::amdgcn_ds_read_tr4_b64:
1813 case Intrinsic::amdgcn_ds_read_tr6_b96:
1814 case Intrinsic::amdgcn_ds_read_tr8_b64:
1815 case Intrinsic::amdgcn_ds_read_tr16_b64:
1816 case Intrinsic::amdgcn_ds_ordered_add:
1817 case Intrinsic::amdgcn_ds_ordered_swap:
1818 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1819 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1820 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1821 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1822 case Intrinsic::amdgcn_global_atomic_fmax_num:
1823 case Intrinsic::amdgcn_global_atomic_fmin_num:
1824 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1825 case Intrinsic::amdgcn_global_load_tr_b64:
1826 case Intrinsic::amdgcn_global_load_tr_b128:
1827 case Intrinsic::amdgcn_global_load_tr4_b64:
1828 case Intrinsic::amdgcn_global_load_tr6_b96:
1829 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1830 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1831 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1833 Ptr = II->getArgOperand(0);
1834 break;
1835 case Intrinsic::amdgcn_load_to_lds:
1836 case Intrinsic::amdgcn_load_async_to_lds:
1837 case Intrinsic::amdgcn_global_load_lds:
1838 case Intrinsic::amdgcn_global_load_async_lds:
1839 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1840 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1841 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1843 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1844 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1845 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1847 Ptr = II->getArgOperand(1);
1848 break;
1849 default:
1850 return false;
1851 }
1852 AccessTy = II->getType();
1853 Ops.push_back(Ptr);
1854 return true;
1855}
1856
1858 unsigned AddrSpace) const {
1859 if (!Subtarget->hasFlatInstOffsets()) {
1860 // Flat instructions do not have offsets, and only have the register
1861 // address.
1862 return AM.BaseOffs == 0 && AM.Scale == 0;
1863 }
1864
1865 decltype(SIInstrFlags::FLAT) FlatVariant =
1869
1870 return AM.Scale == 0 &&
1871 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1872 AM.BaseOffs, AddrSpace, FlatVariant));
1873}
1874
1876 if (Subtarget->hasFlatGlobalInsts())
1878
1879 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1880 // Assume the we will use FLAT for all global memory accesses
1881 // on VI.
1882 // FIXME: This assumption is currently wrong. On VI we still use
1883 // MUBUF instructions for the r + i addressing mode. As currently
1884 // implemented, the MUBUF instructions only work on buffer < 4GB.
1885 // It may be possible to support > 4GB buffers with MUBUF instructions,
1886 // by setting the stride value in the resource descriptor which would
1887 // increase the size limit to (stride * 4GB). However, this is risky,
1888 // because it has never been validated.
1890 }
1891
1892 return isLegalMUBUFAddressingMode(AM);
1893}
1894
1895bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1896 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1897 // additionally can do r + r + i with addr64. 32-bit has more addressing
1898 // mode options. Depending on the resource constant, it can also do
1899 // (i64 r0) + (i32 r1) * (i14 i).
1900 //
1901 // Private arrays end up using a scratch buffer most of the time, so also
1902 // assume those use MUBUF instructions. Scratch loads / stores are currently
1903 // implemented as mubuf instructions with offen bit set, so slightly
1904 // different than the normal addr64.
1905 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1906 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1907 return false;
1908
1909 // FIXME: Since we can split immediate into soffset and immediate offset,
1910 // would it make sense to allow any immediate?
1911
1912 switch (AM.Scale) {
1913 case 0: // r + i or just i, depending on HasBaseReg.
1914 return true;
1915 case 1:
1916 return true; // We have r + r or r + i.
1917 case 2:
1918 if (AM.HasBaseReg) {
1919 // Reject 2 * r + r.
1920 return false;
1921 }
1922
1923 // Allow 2 * r as r + r
1924 // Or 2 * r + i is allowed as r + r + i.
1925 return true;
1926 default: // Don't allow n * r
1927 return false;
1928 }
1929}
1930
1932 const AddrMode &AM, Type *Ty,
1933 unsigned AS,
1934 Instruction *I) const {
1935 // No global is ever allowed as a base.
1936 if (AM.BaseGV)
1937 return false;
1938
1939 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1940 return isLegalGlobalAddressingMode(AM);
1941
1942 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1946 // If the offset isn't a multiple of 4, it probably isn't going to be
1947 // correctly aligned.
1948 // FIXME: Can we get the real alignment here?
1949 if (AM.BaseOffs % 4 != 0)
1950 return isLegalMUBUFAddressingMode(AM);
1951
1952 if (!Subtarget->hasScalarSubwordLoads()) {
1953 // There are no SMRD extloads, so if we have to do a small type access we
1954 // will use a MUBUF load.
1955 // FIXME?: We also need to do this if unaligned, but we don't know the
1956 // alignment here.
1957 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1958 return isLegalGlobalAddressingMode(AM);
1959 }
1960
1961 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1962 // SMRD instructions have an 8-bit, dword offset on SI.
1963 if (!isUInt<8>(AM.BaseOffs / 4))
1964 return false;
1965 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1966 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1967 // in 8-bits, it can use a smaller encoding.
1968 if (!isUInt<32>(AM.BaseOffs / 4))
1969 return false;
1970 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1971 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1972 if (!isUInt<20>(AM.BaseOffs))
1973 return false;
1974 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1975 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1976 // for S_BUFFER_* instructions).
1977 if (!isInt<21>(AM.BaseOffs))
1978 return false;
1979 } else {
1980 // On GFX12, all offsets are signed 24-bit in bytes.
1981 if (!isInt<24>(AM.BaseOffs))
1982 return false;
1983 }
1984
1985 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1987 AM.BaseOffs < 0) {
1988 // Scalar (non-buffer) loads can only use a negative offset if
1989 // soffset+offset is non-negative. Since the compiler can only prove that
1990 // in a few special cases, it is safer to claim that negative offsets are
1991 // not supported.
1992 return false;
1993 }
1994
1995 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1996 return true;
1997
1998 if (AM.Scale == 1 && AM.HasBaseReg)
1999 return true;
2000
2001 return false;
2002 }
2003
2004 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2005 return Subtarget->hasFlatScratchEnabled()
2007 : isLegalMUBUFAddressingMode(AM);
2008
2009 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2010 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2011 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2012 // field.
2013 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2014 // an 8-bit dword offset but we don't know the alignment here.
2015 if (!isUInt<16>(AM.BaseOffs))
2016 return false;
2017
2018 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2019 return true;
2020
2021 if (AM.Scale == 1 && AM.HasBaseReg)
2022 return true;
2023
2024 return false;
2025 }
2026
2028 // For an unknown address space, this usually means that this is for some
2029 // reason being used for pure arithmetic, and not based on some addressing
2030 // computation. We don't have instructions that compute pointers with any
2031 // addressing modes, so treat them as having no offset like flat
2032 // instructions.
2034 }
2035
2036 // Assume a user alias of global for unknown address spaces.
2037 return isLegalGlobalAddressingMode(AM);
2038}
2039
2041 const MachineFunction &MF) const {
2043 return (MemVT.getSizeInBits() <= 4 * 32);
2044 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2045 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2046 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2047 }
2049 return (MemVT.getSizeInBits() <= 2 * 32);
2050 return true;
2051}
2052
2054 unsigned Size, unsigned AddrSpace, Align Alignment,
2055 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2056 if (IsFast)
2057 *IsFast = 0;
2058
2059 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2060 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2061 // Check if alignment requirements for ds_read/write instructions are
2062 // disabled.
2063 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2064 return false;
2065
2066 Align RequiredAlignment(
2067 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2068 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2069 Alignment < RequiredAlignment)
2070 return false;
2071
2072 // Either, the alignment requirements are "enabled", or there is an
2073 // unaligned LDS access related hardware bug though alignment requirements
2074 // are "disabled". In either case, we need to check for proper alignment
2075 // requirements.
2076 //
2077 switch (Size) {
2078 case 64:
2079 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2080 // address is negative, then the instruction is incorrectly treated as
2081 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2082 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2083 // load later in the SILoadStoreOptimizer.
2084 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2085 return false;
2086
2087 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2088 // can do a 4 byte aligned, 8 byte access in a single operation using
2089 // ds_read2/write2_b32 with adjacent offsets.
2090 RequiredAlignment = Align(4);
2091
2092 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2093 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2094 // ds_write2_b32 depending on the alignment. In either case with either
2095 // alignment there is no faster way of doing this.
2096
2097 // The numbers returned here and below are not additive, it is a 'speed
2098 // rank'. They are just meant to be compared to decide if a certain way
2099 // of lowering an operation is faster than another. For that purpose
2100 // naturally aligned operation gets it bitsize to indicate that "it
2101 // operates with a speed comparable to N-bit wide load". With the full
2102 // alignment ds128 is slower than ds96 for example. If underaligned it
2103 // is comparable to a speed of a single dword access, which would then
2104 // mean 32 < 128 and it is faster to issue a wide load regardless.
2105 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2106 // wider load which will not be aligned anymore the latter is slower.
2107 if (IsFast)
2108 *IsFast = (Alignment >= RequiredAlignment) ? 64
2109 : (Alignment < Align(4)) ? 32
2110 : 1;
2111 return true;
2112 }
2113
2114 break;
2115 case 96:
2116 if (!Subtarget->hasDS96AndDS128())
2117 return false;
2118
2119 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2120 // gfx8 and older.
2121
2122 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2123 // Naturally aligned access is fastest. However, also report it is Fast
2124 // if memory is aligned less than DWORD. A narrow load or store will be
2125 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2126 // be more of them, so overall we will pay less penalty issuing a single
2127 // instruction.
2128
2129 // See comment on the values above.
2130 if (IsFast)
2131 *IsFast = (Alignment >= RequiredAlignment) ? 96
2132 : (Alignment < Align(4)) ? 32
2133 : 1;
2134 return true;
2135 }
2136
2137 break;
2138 case 128:
2139 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2140 return false;
2141
2142 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2143 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2144 // single operation using ds_read2/write2_b64.
2145 RequiredAlignment = Align(8);
2146
2147 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2148 // Naturally aligned access is fastest. However, also report it is Fast
2149 // if memory is aligned less than DWORD. A narrow load or store will be
2150 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2151 // will be more of them, so overall we will pay less penalty issuing a
2152 // single instruction.
2153
2154 // See comment on the values above.
2155 if (IsFast)
2156 *IsFast = (Alignment >= RequiredAlignment) ? 128
2157 : (Alignment < Align(4)) ? 32
2158 : 1;
2159 return true;
2160 }
2161
2162 break;
2163 default:
2164 if (Size > 32)
2165 return false;
2166
2167 break;
2168 }
2169
2170 // See comment on the values above.
2171 // Note that we have a single-dword or sub-dword here, so if underaligned
2172 // it is a slowest possible access, hence returned value is 0.
2173 if (IsFast)
2174 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2175
2176 return Alignment >= RequiredAlignment ||
2177 Subtarget->hasUnalignedDSAccessEnabled();
2178 }
2179
2180 // FIXME: We have to be conservative here and assume that flat operations
2181 // will access scratch. If we had access to the IR function, then we
2182 // could determine if any private memory was used in the function.
2183 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2184 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2185 bool AlignedBy4 = Alignment >= Align(4);
2186 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2187 if (IsFast)
2188 *IsFast = AlignedBy4 ? Size : 1;
2189 return true;
2190 }
2191
2192 if (IsFast)
2193 *IsFast = AlignedBy4;
2194
2195 return AlignedBy4;
2196 }
2197
2198 // So long as they are correct, wide global memory operations perform better
2199 // than multiple smaller memory ops -- even when misaligned
2200 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2201 if (IsFast)
2202 *IsFast = Size;
2203
2204 return Alignment >= Align(4) ||
2205 Subtarget->hasUnalignedBufferAccessEnabled();
2206 }
2207
2208 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2209 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2210 // out-of-bounds behavior, but in the edge case where an access starts
2211 // out-of-bounds and then enter in-bounds, the entire access would be treated
2212 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2213 // natural alignment of buffer accesses.
2214 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2215 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2216 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2217 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2218 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2219 return false;
2220 }
2221
2222 // Smaller than dword value must be aligned.
2223 if (Size < 32)
2224 return false;
2225
2226 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2227 // byte-address are ignored, thus forcing Dword alignment.
2228 // This applies to private, global, and constant memory.
2229 if (IsFast)
2230 *IsFast = 1;
2231
2232 return Size >= 32 && Alignment >= Align(4);
2233}
2234
2236 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2237 unsigned *IsFast) const {
2239 Alignment, Flags, IsFast);
2240}
2241
2243 LLVMContext &Context, const MemOp &Op,
2244 const AttributeList &FuncAttributes) const {
2245 // FIXME: Should account for address space here.
2246
2247 // The default fallback uses the private pointer size as a guess for a type to
2248 // use. Make sure we switch these to 64-bit accesses.
2249
2250 if (Op.size() >= 16 &&
2251 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2252 return MVT::v4i32;
2253
2254 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2255 return MVT::v2i32;
2256
2257 // Use the default.
2258 return MVT::Other;
2259}
2260
2262 const MemSDNode *MemNode = cast<MemSDNode>(N);
2263 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2264}
2265
2270
2272 unsigned DestAS) const {
2273 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2274 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2275 Subtarget->hasGloballyAddressableScratch()) {
2276 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2277 return false;
2278 }
2279
2280 // Flat -> private/local is a simple truncate.
2281 // Flat -> global is no-op
2282 return true;
2283 }
2284
2285 const GCNTargetMachine &TM =
2286 static_cast<const GCNTargetMachine &>(getTargetMachine());
2287 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2288}
2289
2297
2299 Type *Ty) const {
2300 // FIXME: Could be smarter if called for vector constants.
2301 return true;
2302}
2303
2305 unsigned Index) const {
2307 return false;
2308
2309 // TODO: Add more cases that are cheap.
2310 return Index == 0;
2311}
2312
2313bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2314 // TODO: This should be more aggressive, particular for 16-bit element
2315 // vectors. However there are some mixed improvements and regressions.
2316 EVT EltTy = VT.getVectorElementType();
2317 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2318 return EltTy.getSizeInBits() % MinAlign == 0;
2319}
2320
2322 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2323 switch (Op) {
2324 case ISD::LOAD:
2325 case ISD::STORE:
2326 return true;
2327 default:
2328 return false;
2329 }
2330 }
2331
2332 // SimplifySetCC uses this function to determine whether or not it should
2333 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2334 if (VT == MVT::i1 && Op == ISD::SETCC)
2335 return false;
2336
2338}
2339
2342 // This isn't really a constant pool but close enough.
2345 return PtrInfo;
2346}
2347
2348SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2349 const SDLoc &SL,
2350 SDValue Chain,
2351 uint64_t Offset) const {
2352 const DataLayout &DL = DAG.getDataLayout();
2356
2357 auto [InputPtrReg, RC, ArgTy] =
2358 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2359
2360 // We may not have the kernarg segment argument if we have no kernel
2361 // arguments.
2362 if (!InputPtrReg)
2363 return DAG.getConstant(Offset, SL, PtrVT);
2364
2366 SDValue BasePtr = DAG.getCopyFromReg(
2367 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2368
2369 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2370}
2371
2372SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2373 const SDLoc &SL) const {
2376 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2377}
2378
2379SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2380 const SDLoc &SL) const {
2381
2383 std::optional<uint32_t> KnownSize =
2385 if (KnownSize.has_value())
2386 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2387 return SDValue();
2388}
2389
2390SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2391 const SDLoc &SL, SDValue Val,
2392 bool Signed,
2393 const ISD::InputArg *Arg) const {
2394 // First, if it is a widened vector, narrow it.
2395 if (VT.isVector() &&
2397 EVT NarrowedVT =
2400 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2401 DAG.getConstant(0, SL, MVT::i32));
2402 }
2403
2404 // Then convert the vector elements or scalar value.
2405 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2406 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2407 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2408 }
2409
2410 if (MemVT.isFloatingPoint()) {
2411 if (VT.isFloatingPoint()) {
2412 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2413 } else {
2414 assert(!MemVT.isVector());
2415 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2416 SDValue Cast = DAG.getBitcast(IntVT, Val);
2417 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2418 }
2419 } else if (Signed)
2420 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2421 else
2422 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2423
2424 return Val;
2425}
2426
2427SDValue SITargetLowering::lowerKernargMemParameter(
2428 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2429 uint64_t Offset, Align Alignment, bool Signed,
2430 const ISD::InputArg *Arg) const {
2431
2432 MachinePointerInfo PtrInfo =
2434
2435 // Try to avoid using an extload by loading earlier than the argument address,
2436 // and extracting the relevant bits. The load should hopefully be merged with
2437 // the previous argument.
2438 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2439 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2440 int64_t AlignDownOffset = alignDown(Offset, 4);
2441 int64_t OffsetDiff = Offset - AlignDownOffset;
2442
2443 EVT IntVT = MemVT.changeTypeToInteger();
2444
2445 // TODO: If we passed in the base kernel offset we could have a better
2446 // alignment than 4, but we don't really need it.
2447 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2448 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2449 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2452
2453 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2454 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2455
2456 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2457 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2458 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2459
2460 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2461 }
2462
2463 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2464 SDValue Load = DAG.getLoad(
2465 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2467
2468 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2469 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2470}
2471
2472/// Coerce an argument which was passed in a different ABI type to the original
2473/// expected value type.
2474SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2475 SDValue Val,
2476 CCValAssign &VA,
2477 const SDLoc &SL) const {
2478 EVT ValVT = VA.getValVT();
2479
2480 // If this is an 8 or 16-bit value, it is really passed promoted
2481 // to 32 bits. Insert an assert[sz]ext to capture this, then
2482 // truncate to the right size.
2483 switch (VA.getLocInfo()) {
2484 case CCValAssign::Full:
2485 return Val;
2486 case CCValAssign::BCvt:
2487 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2488 case CCValAssign::SExt:
2489 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2490 DAG.getValueType(ValVT));
2491 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2492 case CCValAssign::ZExt:
2493 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2494 DAG.getValueType(ValVT));
2495 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2496 case CCValAssign::AExt:
2497 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2498 default:
2499 llvm_unreachable("Unknown loc info!");
2500 }
2501}
2502
2503SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2504 CCValAssign &VA, const SDLoc &SL,
2505 SDValue Chain,
2506 const ISD::InputArg &Arg) const {
2507 MachineFunction &MF = DAG.getMachineFunction();
2508 MachineFrameInfo &MFI = MF.getFrameInfo();
2509
2510 if (Arg.Flags.isByVal()) {
2511 unsigned Size = Arg.Flags.getByValSize();
2512 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2513 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2514 }
2515
2516 unsigned ArgOffset = VA.getLocMemOffset();
2517 unsigned ArgSize = VA.getValVT().getStoreSize();
2518
2519 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2520
2521 // Create load nodes to retrieve arguments from the stack.
2522 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2523
2524 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2526 MVT MemVT = VA.getValVT();
2527
2528 switch (VA.getLocInfo()) {
2529 default:
2530 break;
2531 case CCValAssign::BCvt:
2532 MemVT = VA.getLocVT();
2533 break;
2534 case CCValAssign::SExt:
2535 ExtType = ISD::SEXTLOAD;
2536 break;
2537 case CCValAssign::ZExt:
2538 ExtType = ISD::ZEXTLOAD;
2539 break;
2540 case CCValAssign::AExt:
2541 ExtType = ISD::EXTLOAD;
2542 break;
2543 }
2544
2545 SDValue ArgValue = DAG.getExtLoad(
2546 ExtType, SL, VA.getLocVT(), Chain, FIN,
2548
2549 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2550 if (ConvertedVal == ArgValue)
2551 return ConvertedVal;
2552
2553 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2554}
2555
2556SDValue SITargetLowering::lowerWorkGroupId(
2557 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2560 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2561 if (!Subtarget->hasClusters())
2562 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2563
2564 // Clusters are supported. Return the global position in the grid. If clusters
2565 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2566
2567 // WorkGroupIdXYZ = ClusterId == 0 ?
2568 // ClusterIdXYZ :
2569 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2570 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2571 SDLoc SL(ClusterIdXYZ);
2572 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2573 SDValue One = DAG.getConstant(1, SL, VT);
2574 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2575 SDValue ClusterWorkGroupIdXYZ =
2576 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2577 SDValue GlobalIdXYZ =
2578 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2579 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2580
2581 switch (MFI.getClusterDims().getKind()) {
2584 return GlobalIdXYZ;
2586 return ClusterIdXYZ;
2588 using namespace AMDGPU::Hwreg;
2589 SDValue ClusterIdField =
2590 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2591 SDNode *GetReg =
2592 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2593 SDValue ClusterId(GetReg, 0);
2594 SDValue Zero = DAG.getConstant(0, SL, VT);
2595 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2596 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2597 }
2598 }
2599
2600 llvm_unreachable("nothing should reach here");
2601}
2602
2603SDValue SITargetLowering::getPreloadedValue(
2604 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2606 const ArgDescriptor *Reg = nullptr;
2607 const TargetRegisterClass *RC;
2608 LLT Ty;
2609
2611 const ArgDescriptor WorkGroupIDX =
2612 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2613 // If GridZ is not programmed in an entry function then the hardware will set
2614 // it to all zeros, so there is no need to mask the GridY value in the low
2615 // order bits.
2616 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2617 AMDGPU::TTMP7,
2618 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2619 const ArgDescriptor WorkGroupIDZ =
2620 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2621 const ArgDescriptor ClusterWorkGroupIDX =
2622 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2623 const ArgDescriptor ClusterWorkGroupIDY =
2624 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2625 const ArgDescriptor ClusterWorkGroupIDZ =
2626 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2627 const ArgDescriptor ClusterWorkGroupMaxIDX =
2628 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2629 const ArgDescriptor ClusterWorkGroupMaxIDY =
2630 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2631 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2632 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2633 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2634 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2635
2636 auto LoadConstant = [&](unsigned N) {
2637 return DAG.getConstant(N, SDLoc(), VT);
2638 };
2639
2640 if (Subtarget->hasArchitectedSGPRs() &&
2642 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2643 bool HasFixedDims = ClusterDims.isFixedDims();
2644
2645 switch (PVID) {
2647 Reg = &WorkGroupIDX;
2648 RC = &AMDGPU::SReg_32RegClass;
2649 Ty = LLT::scalar(32);
2650 break;
2652 Reg = &WorkGroupIDY;
2653 RC = &AMDGPU::SReg_32RegClass;
2654 Ty = LLT::scalar(32);
2655 break;
2657 Reg = &WorkGroupIDZ;
2658 RC = &AMDGPU::SReg_32RegClass;
2659 Ty = LLT::scalar(32);
2660 break;
2662 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2663 return LoadConstant(0);
2664 Reg = &ClusterWorkGroupIDX;
2665 RC = &AMDGPU::SReg_32RegClass;
2666 Ty = LLT::scalar(32);
2667 break;
2669 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2670 return LoadConstant(0);
2671 Reg = &ClusterWorkGroupIDY;
2672 RC = &AMDGPU::SReg_32RegClass;
2673 Ty = LLT::scalar(32);
2674 break;
2676 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2677 return LoadConstant(0);
2678 Reg = &ClusterWorkGroupIDZ;
2679 RC = &AMDGPU::SReg_32RegClass;
2680 Ty = LLT::scalar(32);
2681 break;
2683 if (HasFixedDims)
2684 return LoadConstant(ClusterDims.getDims()[0] - 1);
2685 Reg = &ClusterWorkGroupMaxIDX;
2686 RC = &AMDGPU::SReg_32RegClass;
2687 Ty = LLT::scalar(32);
2688 break;
2690 if (HasFixedDims)
2691 return LoadConstant(ClusterDims.getDims()[1] - 1);
2692 Reg = &ClusterWorkGroupMaxIDY;
2693 RC = &AMDGPU::SReg_32RegClass;
2694 Ty = LLT::scalar(32);
2695 break;
2697 if (HasFixedDims)
2698 return LoadConstant(ClusterDims.getDims()[2] - 1);
2699 Reg = &ClusterWorkGroupMaxIDZ;
2700 RC = &AMDGPU::SReg_32RegClass;
2701 Ty = LLT::scalar(32);
2702 break;
2704 Reg = &ClusterWorkGroupMaxFlatID;
2705 RC = &AMDGPU::SReg_32RegClass;
2706 Ty = LLT::scalar(32);
2707 break;
2708 default:
2709 break;
2710 }
2711 }
2712
2713 if (!Reg)
2714 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2715 if (!Reg) {
2717 // It's possible for a kernarg intrinsic call to appear in a kernel with
2718 // no allocated segment, in which case we do not add the user sgpr
2719 // argument, so just return null.
2720 return DAG.getConstant(0, SDLoc(), VT);
2721 }
2722
2723 // It's undefined behavior if a function marked with the amdgpu-no-*
2724 // attributes uses the corresponding intrinsic.
2725 return DAG.getPOISON(VT);
2726 }
2727
2728 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2729}
2730
2732 CallingConv::ID CallConv,
2733 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2734 FunctionType *FType,
2735 SIMachineFunctionInfo *Info) {
2736 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2737 const ISD::InputArg *Arg = &Ins[I];
2738
2739 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2740 "vector type argument should have been split");
2741
2742 // First check if it's a PS input addr.
2743 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2744 PSInputNum <= 15) {
2745 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2746
2747 // Inconveniently only the first part of the split is marked as isSplit,
2748 // so skip to the end. We only want to increment PSInputNum once for the
2749 // entire split argument.
2750 if (Arg->Flags.isSplit()) {
2751 while (!Arg->Flags.isSplitEnd()) {
2752 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2753 "unexpected vector split in ps argument type");
2754 if (!SkipArg)
2755 Splits.push_back(*Arg);
2756 Arg = &Ins[++I];
2757 }
2758 }
2759
2760 if (SkipArg) {
2761 // We can safely skip PS inputs.
2762 Skipped.set(Arg->getOrigArgIndex());
2763 ++PSInputNum;
2764 continue;
2765 }
2766
2767 Info->markPSInputAllocated(PSInputNum);
2768 if (Arg->Used)
2769 Info->markPSInputEnabled(PSInputNum);
2770
2771 ++PSInputNum;
2772 }
2773
2774 Splits.push_back(*Arg);
2775 }
2776}
2777
2778// Allocate special inputs passed in VGPRs.
2780 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2781 SIMachineFunctionInfo &Info) const {
2782 const LLT S32 = LLT::scalar(32);
2784
2785 if (Info.hasWorkItemIDX()) {
2786 Register Reg = AMDGPU::VGPR0;
2787 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2788
2789 CCInfo.AllocateReg(Reg);
2790 unsigned Mask =
2791 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2792 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2793 }
2794
2795 if (Info.hasWorkItemIDY()) {
2796 assert(Info.hasWorkItemIDX());
2797 if (Subtarget->hasPackedTID()) {
2798 Info.setWorkItemIDY(
2799 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2800 } else {
2801 unsigned Reg = AMDGPU::VGPR1;
2802 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2803
2804 CCInfo.AllocateReg(Reg);
2805 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2806 }
2807 }
2808
2809 if (Info.hasWorkItemIDZ()) {
2810 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2811 if (Subtarget->hasPackedTID()) {
2812 Info.setWorkItemIDZ(
2813 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2814 } else {
2815 unsigned Reg = AMDGPU::VGPR2;
2816 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2817
2818 CCInfo.AllocateReg(Reg);
2819 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2820 }
2821 }
2822}
2823
2824// Try to allocate a VGPR at the end of the argument list, or if no argument
2825// VGPRs are left allocating a stack slot.
2826// If \p Mask is is given it indicates bitfield position in the register.
2827// If \p Arg is given use it with new ]p Mask instead of allocating new.
2828static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2829 ArgDescriptor Arg = ArgDescriptor()) {
2830 if (Arg.isSet())
2831 return ArgDescriptor::createArg(Arg, Mask);
2832
2833 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2834 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2835 if (RegIdx == ArgVGPRs.size()) {
2836 // Spill to stack required.
2837 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2838
2839 return ArgDescriptor::createStack(Offset, Mask);
2840 }
2841
2842 unsigned Reg = ArgVGPRs[RegIdx];
2843 Reg = CCInfo.AllocateReg(Reg);
2844 assert(Reg != AMDGPU::NoRegister);
2845
2846 MachineFunction &MF = CCInfo.getMachineFunction();
2847 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2848 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2849 return ArgDescriptor::createRegister(Reg, Mask);
2850}
2851
2853 const TargetRegisterClass *RC,
2854 unsigned NumArgRegs) {
2855 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2856 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2857 if (RegIdx == ArgSGPRs.size())
2858 report_fatal_error("ran out of SGPRs for arguments");
2859
2860 unsigned Reg = ArgSGPRs[RegIdx];
2861 Reg = CCInfo.AllocateReg(Reg);
2862 assert(Reg != AMDGPU::NoRegister);
2863
2864 MachineFunction &MF = CCInfo.getMachineFunction();
2865 MF.addLiveIn(Reg, RC);
2867}
2868
2869// If this has a fixed position, we still should allocate the register in the
2870// CCInfo state. Technically we could get away with this for values passed
2871// outside of the normal argument range.
2873 const TargetRegisterClass *RC,
2874 MCRegister Reg) {
2875 Reg = CCInfo.AllocateReg(Reg);
2876 assert(Reg != AMDGPU::NoRegister);
2877 MachineFunction &MF = CCInfo.getMachineFunction();
2878 MF.addLiveIn(Reg, RC);
2879}
2880
2881static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2882 if (Arg) {
2883 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2884 Arg.getRegister());
2885 } else
2886 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2887}
2888
2889static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2890 if (Arg) {
2891 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2892 Arg.getRegister());
2893 } else
2894 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2895}
2896
2897/// Allocate implicit function VGPR arguments at the end of allocated user
2898/// arguments.
2900 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2901 SIMachineFunctionInfo &Info) const {
2902 const unsigned Mask = 0x3ff;
2903 ArgDescriptor Arg;
2904
2905 if (Info.hasWorkItemIDX()) {
2906 Arg = allocateVGPR32Input(CCInfo, Mask);
2907 Info.setWorkItemIDX(Arg);
2908 }
2909
2910 if (Info.hasWorkItemIDY()) {
2911 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2912 Info.setWorkItemIDY(Arg);
2913 }
2914
2915 if (Info.hasWorkItemIDZ())
2916 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2917}
2918
2919/// Allocate implicit function VGPR arguments in fixed registers.
2921 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2922 SIMachineFunctionInfo &Info) const {
2923 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2924 if (!Reg)
2925 report_fatal_error("failed to allocate VGPR for implicit arguments");
2926
2927 const unsigned Mask = 0x3ff;
2928 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2929 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2930 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2931}
2932
2934 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2935 SIMachineFunctionInfo &Info) const {
2936 auto &ArgInfo = Info.getArgInfo();
2937 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2938
2939 // TODO: Unify handling with private memory pointers.
2940 if (UserSGPRInfo.hasDispatchPtr())
2941 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2942
2943 if (UserSGPRInfo.hasQueuePtr())
2944 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2945
2946 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2947 // constant offset from the kernarg segment.
2948 if (Info.hasImplicitArgPtr())
2949 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2950
2951 if (UserSGPRInfo.hasDispatchID())
2952 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2953
2954 // flat_scratch_init is not applicable for non-kernel functions.
2955
2956 if (Info.hasWorkGroupIDX())
2957 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2958
2959 if (Info.hasWorkGroupIDY())
2960 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2961
2962 if (Info.hasWorkGroupIDZ())
2963 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2964
2965 if (Info.hasLDSKernelId())
2966 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2967}
2968
2969// Allocate special inputs passed in user SGPRs.
2971 MachineFunction &MF,
2972 const SIRegisterInfo &TRI,
2973 SIMachineFunctionInfo &Info) const {
2974 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2975 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2976 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2977 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2978 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2979 }
2980
2981 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2982 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2983 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2984 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2985 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2986 }
2987
2988 if (UserSGPRInfo.hasDispatchPtr()) {
2989 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2990 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2991 CCInfo.AllocateReg(DispatchPtrReg);
2992 }
2993
2994 if (UserSGPRInfo.hasQueuePtr()) {
2995 Register QueuePtrReg = Info.addQueuePtr(TRI);
2996 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2997 CCInfo.AllocateReg(QueuePtrReg);
2998 }
2999
3000 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3002 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3003 CCInfo.AllocateReg(InputPtrReg);
3004
3005 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3006 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
3007 }
3008
3009 if (UserSGPRInfo.hasDispatchID()) {
3010 Register DispatchIDReg = Info.addDispatchID(TRI);
3011 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3012 CCInfo.AllocateReg(DispatchIDReg);
3013 }
3014
3015 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3016 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3017 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3018 CCInfo.AllocateReg(FlatScratchInitReg);
3019 }
3020
3021 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3022 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3023 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3024 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3025 }
3026
3027 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3028 // these from the dispatch pointer.
3029}
3030
3031// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
3032// sequential starting from the first argument.
3034 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3036 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3037 Function &F = MF.getFunction();
3038 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3039 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3040 bool InPreloadSequence = true;
3041 unsigned InIdx = 0;
3042 bool AlignedForImplictArgs = false;
3043 unsigned ImplicitArgOffset = 0;
3044 for (auto &Arg : F.args()) {
3045 if (!InPreloadSequence || !Arg.hasInRegAttr())
3046 break;
3047
3048 unsigned ArgIdx = Arg.getArgNo();
3049 // Don't preload non-original args or parts not in the current preload
3050 // sequence.
3051 if (InIdx < Ins.size() &&
3052 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3053 break;
3054
3055 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3056 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3057 InIdx++) {
3058 assert(ArgLocs[ArgIdx].isMemLoc());
3059 auto &ArgLoc = ArgLocs[InIdx];
3060 const Align KernelArgBaseAlign = Align(16);
3061 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3062 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
3063 unsigned NumAllocSGPRs =
3064 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3065
3066 // Fix alignment for hidden arguments.
3067 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3068 if (!AlignedForImplictArgs) {
3069 ImplicitArgOffset =
3070 alignTo(LastExplicitArgOffset,
3071 Subtarget->getAlignmentForImplicitArgPtr()) -
3072 LastExplicitArgOffset;
3073 AlignedForImplictArgs = true;
3074 }
3075 ArgOffset += ImplicitArgOffset;
3076 }
3077
3078 // Arg is preloaded into the previous SGPR.
3079 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3080 assert(InIdx >= 1 && "No previous SGPR");
3081 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3082 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3083 continue;
3084 }
3085
3086 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3087 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3088 // Check for free user SGPRs for preloading.
3089 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3090 InPreloadSequence = false;
3091 break;
3092 }
3093
3094 // Preload this argument.
3095 const TargetRegisterClass *RC =
3096 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3097 SmallVectorImpl<MCRegister> *PreloadRegs =
3098 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3099
3100 if (PreloadRegs->size() > 1)
3101 RC = &AMDGPU::SGPR_32RegClass;
3102 for (auto &Reg : *PreloadRegs) {
3103 assert(Reg);
3104 MF.addLiveIn(Reg, RC);
3105 CCInfo.AllocateReg(Reg);
3106 }
3107
3108 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3109 }
3110 }
3111}
3112
3114 const SIRegisterInfo &TRI,
3115 SIMachineFunctionInfo &Info) const {
3116 // Always allocate this last since it is a synthetic preload.
3117 if (Info.hasLDSKernelId()) {
3118 Register Reg = Info.addLDSKernelId();
3119 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3120 CCInfo.AllocateReg(Reg);
3121 }
3122}
3123
3124// Allocate special input registers that are initialized per-wave.
3127 CallingConv::ID CallConv,
3128 bool IsShader) const {
3129 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3130 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3131 // Note: user SGPRs are handled by the front-end for graphics shaders
3132 // Pad up the used user SGPRs with dead inputs.
3133
3134 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3135 // before enabling architected SGPRs for workgroup IDs.
3136 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3137
3138 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3139 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3140 // rely on it to reach 16 since if we end up having no stack usage, it will
3141 // not really be added.
3142 unsigned NumRequiredSystemSGPRs =
3143 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3144 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3145 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3146 Register Reg = Info.addReservedUserSGPR();
3147 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3148 CCInfo.AllocateReg(Reg);
3149 }
3150 }
3151
3152 if (!HasArchitectedSGPRs) {
3153 if (Info.hasWorkGroupIDX()) {
3154 Register Reg = Info.addWorkGroupIDX();
3155 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3156 CCInfo.AllocateReg(Reg);
3157 }
3158
3159 if (Info.hasWorkGroupIDY()) {
3160 Register Reg = Info.addWorkGroupIDY();
3161 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3162 CCInfo.AllocateReg(Reg);
3163 }
3164
3165 if (Info.hasWorkGroupIDZ()) {
3166 Register Reg = Info.addWorkGroupIDZ();
3167 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3168 CCInfo.AllocateReg(Reg);
3169 }
3170 }
3171
3172 if (Info.hasWorkGroupInfo()) {
3173 Register Reg = Info.addWorkGroupInfo();
3174 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3175 CCInfo.AllocateReg(Reg);
3176 }
3177
3178 if (Info.hasPrivateSegmentWaveByteOffset()) {
3179 // Scratch wave offset passed in system SGPR.
3180 unsigned PrivateSegmentWaveByteOffsetReg;
3181
3182 if (IsShader) {
3183 PrivateSegmentWaveByteOffsetReg =
3184 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3185
3186 // This is true if the scratch wave byte offset doesn't have a fixed
3187 // location.
3188 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3189 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3190 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3191 }
3192 } else
3193 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3194
3195 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3196 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3197 }
3198
3199 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3200 Info.getNumPreloadedSGPRs() >= 16);
3201}
3202
3204 MachineFunction &MF,
3205 const SIRegisterInfo &TRI,
3206 SIMachineFunctionInfo &Info) {
3207 // Now that we've figured out where the scratch register inputs are, see if
3208 // should reserve the arguments and use them directly.
3209 MachineFrameInfo &MFI = MF.getFrameInfo();
3210 bool HasStackObjects = MFI.hasStackObjects();
3211 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3212
3213 // Record that we know we have non-spill stack objects so we don't need to
3214 // check all stack objects later.
3215 if (HasStackObjects)
3216 Info.setHasNonSpillStackObjects(true);
3217
3218 // Everything live out of a block is spilled with fast regalloc, so it's
3219 // almost certain that spilling will be required.
3221 HasStackObjects = true;
3222
3223 // For now assume stack access is needed in any callee functions, so we need
3224 // the scratch registers to pass in.
3225 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3226
3227 if (!ST.hasFlatScratchEnabled()) {
3228 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3229 // If we have stack objects, we unquestionably need the private buffer
3230 // resource. For the Code Object V2 ABI, this will be the first 4 user
3231 // SGPR inputs. We can reserve those and use them directly.
3232
3233 Register PrivateSegmentBufferReg =
3235 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3236 } else {
3237 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3238 // We tentatively reserve the last registers (skipping the last registers
3239 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3240 // we'll replace these with the ones immediately after those which were
3241 // really allocated. In the prologue copies will be inserted from the
3242 // argument to these reserved registers.
3243
3244 // Without HSA, relocations are used for the scratch pointer and the
3245 // buffer resource setup is always inserted in the prologue. Scratch wave
3246 // offset is still in an input SGPR.
3247 Info.setScratchRSrcReg(ReservedBufferReg);
3248 }
3249 }
3250
3252
3253 // For entry functions we have to set up the stack pointer if we use it,
3254 // whereas non-entry functions get this "for free". This means there is no
3255 // intrinsic advantage to using S32 over S34 in cases where we do not have
3256 // calls but do need a frame pointer (i.e. if we are requested to have one
3257 // because frame pointer elimination is disabled). To keep things simple we
3258 // only ever use S32 as the call ABI stack pointer, and so using it does not
3259 // imply we need a separate frame pointer.
3260 //
3261 // Try to use s32 as the SP, but move it if it would interfere with input
3262 // arguments. This won't work with calls though.
3263 //
3264 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3265 // registers.
3266 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3267 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3268 } else {
3270
3271 if (MFI.hasCalls())
3272 report_fatal_error("call in graphics shader with too many input SGPRs");
3273
3274 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3275 if (!MRI.isLiveIn(Reg)) {
3276 Info.setStackPtrOffsetReg(Reg);
3277 break;
3278 }
3279 }
3280
3281 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3282 report_fatal_error("failed to find register for SP");
3283 }
3284
3285 // hasFP should be accurate for entry functions even before the frame is
3286 // finalized, because it does not rely on the known stack size, only
3287 // properties like whether variable sized objects are present.
3288 if (ST.getFrameLowering()->hasFP(MF)) {
3289 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3290 }
3291}
3292
3295 return !Info->isEntryFunction();
3296}
3297
3299
3301 MachineBasicBlock *Entry,
3302 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3304
3305 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3306 if (!IStart)
3307 return;
3308
3309 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3310 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3311 MachineBasicBlock::iterator MBBI = Entry->begin();
3312 for (const MCPhysReg *I = IStart; *I; ++I) {
3313 const TargetRegisterClass *RC = nullptr;
3314 if (AMDGPU::SReg_64RegClass.contains(*I))
3315 RC = &AMDGPU::SGPR_64RegClass;
3316 else if (AMDGPU::SReg_32RegClass.contains(*I))
3317 RC = &AMDGPU::SGPR_32RegClass;
3318 else
3319 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3320
3321 Register NewVR = MRI->createVirtualRegister(RC);
3322 // Create copy from CSR to a virtual register.
3323 Entry->addLiveIn(*I);
3324 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3325 .addReg(*I);
3326
3327 // Insert the copy-back instructions right before the terminator.
3328 for (auto *Exit : Exits)
3329 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3330 TII->get(TargetOpcode::COPY), *I)
3331 .addReg(NewVR);
3332 }
3333}
3334
3336 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3337 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3338 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3340
3342 const Function &Fn = MF.getFunction();
3345 bool IsError = false;
3346
3347 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3349 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3350 IsError = true;
3351 }
3352
3355 BitVector Skipped(Ins.size());
3356 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3357 *DAG.getContext());
3358
3359 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3360 bool IsKernel = AMDGPU::isKernel(CallConv);
3361 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3362
3363 if (IsGraphics) {
3364 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3365 assert(!UserSGPRInfo.hasDispatchPtr() &&
3366 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3367 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3368 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3369 (void)UserSGPRInfo;
3370 if (!Subtarget->hasFlatScratchEnabled())
3371 assert(!UserSGPRInfo.hasFlatScratchInit());
3372 if ((CallConv != CallingConv::AMDGPU_CS &&
3373 CallConv != CallingConv::AMDGPU_Gfx &&
3374 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3375 !Subtarget->hasArchitectedSGPRs())
3376 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3377 !Info->hasWorkGroupIDZ());
3378 }
3379
3380 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3381
3382 if (CallConv == CallingConv::AMDGPU_PS) {
3383 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3384
3385 // At least one interpolation mode must be enabled or else the GPU will
3386 // hang.
3387 //
3388 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3389 // set PSInputAddr, the user wants to enable some bits after the compilation
3390 // based on run-time states. Since we can't know what the final PSInputEna
3391 // will look like, so we shouldn't do anything here and the user should take
3392 // responsibility for the correct programming.
3393 //
3394 // Otherwise, the following restrictions apply:
3395 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3396 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3397 // enabled too.
3398 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3399 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3400 CCInfo.AllocateReg(AMDGPU::VGPR0);
3401 CCInfo.AllocateReg(AMDGPU::VGPR1);
3402 Info->markPSInputAllocated(0);
3403 Info->markPSInputEnabled(0);
3404 }
3405 if (Subtarget->isAmdPalOS()) {
3406 // For isAmdPalOS, the user does not enable some bits after compilation
3407 // based on run-time states; the register values being generated here are
3408 // the final ones set in hardware. Therefore we need to apply the
3409 // workaround to PSInputAddr and PSInputEnable together. (The case where
3410 // a bit is set in PSInputAddr but not PSInputEnable is where the
3411 // frontend set up an input arg for a particular interpolation mode, but
3412 // nothing uses that input arg. Really we should have an earlier pass
3413 // that removes such an arg.)
3414 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3415 if ((PsInputBits & 0x7F) == 0 ||
3416 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3417 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3418 }
3419 } else if (IsKernel) {
3420 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3421 } else {
3422 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3423 Ins.end());
3424 }
3425
3426 if (IsKernel)
3427 analyzeFormalArgumentsCompute(CCInfo, Ins);
3428
3429 if (IsEntryFunc) {
3430 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3431 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3432 if (IsKernel && Subtarget->hasKernargPreload())
3433 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3434
3435 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3436 } else if (!IsGraphics) {
3437 // For the fixed ABI, pass workitem IDs in the last argument register.
3438 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3439
3440 // FIXME: Sink this into allocateSpecialInputSGPRs
3441 if (!Subtarget->hasFlatScratchEnabled())
3442 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3443
3444 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3445 }
3446
3447 if (!IsKernel) {
3448 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3449 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3450
3451 // This assumes the registers are allocated by CCInfo in ascending order
3452 // with no gaps.
3453 Info->setNumWaveDispatchSGPRs(
3454 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3455 Info->setNumWaveDispatchVGPRs(
3456 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3457 } else if (Info->getNumKernargPreloadedSGPRs()) {
3458 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3459 }
3460
3462
3463 if (IsWholeWaveFunc) {
3464 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3465 {MVT::i1, MVT::Other}, Chain);
3466 InVals.push_back(Setup.getValue(0));
3467 Chains.push_back(Setup.getValue(1));
3468 }
3469
3470 // FIXME: This is the minimum kernel argument alignment. We should improve
3471 // this to the maximum alignment of the arguments.
3472 //
3473 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3474 // kern arg offset.
3475 const Align KernelArgBaseAlign = Align(16);
3476
3477 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3478 ++i) {
3479 const ISD::InputArg &Arg = Ins[i];
3480 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3481 InVals.push_back(DAG.getPOISON(Arg.VT));
3482 continue;
3483 }
3484
3485 CCValAssign &VA = ArgLocs[ArgIdx++];
3486 MVT VT = VA.getLocVT();
3487
3488 if (IsEntryFunc && VA.isMemLoc()) {
3489 VT = Ins[i].VT;
3490 EVT MemVT = VA.getLocVT();
3491
3492 const uint64_t Offset = VA.getLocMemOffset();
3493 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3494
3495 if (Arg.Flags.isByRef()) {
3496 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3497
3498 const GCNTargetMachine &TM =
3499 static_cast<const GCNTargetMachine &>(getTargetMachine());
3500 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3501 Arg.Flags.getPointerAddrSpace())) {
3504 }
3505
3506 InVals.push_back(Ptr);
3507 continue;
3508 }
3509
3510 SDValue NewArg;
3511 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3512 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3513 // In this case the argument is packed into the previous preload SGPR.
3514 int64_t AlignDownOffset = alignDown(Offset, 4);
3515 int64_t OffsetDiff = Offset - AlignDownOffset;
3516 EVT IntVT = MemVT.changeTypeToInteger();
3517
3518 const SIMachineFunctionInfo *Info =
3521 Register Reg =
3522 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3523
3524 assert(Reg);
3525 Register VReg = MRI.getLiveInVirtReg(Reg);
3526 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3527
3528 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3529 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3530
3531 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3532 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3533 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3534 Ins[i].Flags.isSExt(), &Ins[i]);
3535
3536 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3537 } else {
3538 const SIMachineFunctionInfo *Info =
3541 const SmallVectorImpl<MCRegister> &PreloadRegs =
3542 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3543
3544 SDValue Copy;
3545 if (PreloadRegs.size() == 1) {
3546 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3547 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3548 NewArg = DAG.getCopyFromReg(
3549 Chain, DL, VReg,
3551 TRI->getRegSizeInBits(*RC)));
3552
3553 } else {
3554 // If the kernarg alignment does not match the alignment of the SGPR
3555 // tuple RC that can accommodate this argument, it will be built up
3556 // via copies from from the individual SGPRs that the argument was
3557 // preloaded to.
3559 for (auto Reg : PreloadRegs) {
3560 Register VReg = MRI.getLiveInVirtReg(Reg);
3561 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3562 Elts.push_back(Copy);
3563 }
3564 NewArg =
3565 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3566 PreloadRegs.size()),
3567 DL, Elts);
3568 }
3569
3570 // If the argument was preloaded to multiple consecutive 32-bit
3571 // registers because of misalignment between addressable SGPR tuples
3572 // and the argument size, we can still assume that because of kernarg
3573 // segment alignment restrictions that NewArg's size is the same as
3574 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3575 // truncate since we cannot preload to less than a single SGPR and the
3576 // MemVT may be smaller.
3577 EVT MemVTInt =
3579 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3580 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3581
3582 NewArg = DAG.getBitcast(MemVT, NewArg);
3583 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3584 Ins[i].Flags.isSExt(), &Ins[i]);
3585 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3586 }
3587 } else {
3588 // Hidden arguments that are in the kernel signature must be preloaded
3589 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3590 // the argument list and is not preloaded.
3591 if (Arg.isOrigArg()) {
3592 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3593 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3595 *OrigArg->getParent(),
3596 "hidden argument in kernel signature was not preloaded",
3597 DL.getDebugLoc()));
3598 }
3599 }
3600
3601 NewArg =
3602 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3603 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3604 }
3605 Chains.push_back(NewArg.getValue(1));
3606
3607 auto *ParamTy =
3608 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3609 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3610 ParamTy &&
3611 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3612 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3613 // On SI local pointers are just offsets into LDS, so they are always
3614 // less than 16-bits. On CI and newer they could potentially be
3615 // real pointers, so we can't guarantee their size.
3616 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3617 DAG.getValueType(MVT::i16));
3618 }
3619
3620 InVals.push_back(NewArg);
3621 continue;
3622 }
3623 if (!IsEntryFunc && VA.isMemLoc()) {
3624 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3625 InVals.push_back(Val);
3626 if (!Arg.Flags.isByVal())
3627 Chains.push_back(Val.getValue(1));
3628 continue;
3629 }
3630
3631 assert(VA.isRegLoc() && "Parameter must be in a register!");
3632
3633 Register Reg = VA.getLocReg();
3634 const TargetRegisterClass *RC = nullptr;
3635 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3636 RC = &AMDGPU::VGPR_32RegClass;
3637 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3638 RC = &AMDGPU::SGPR_32RegClass;
3639 else
3640 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3641
3642 Reg = MF.addLiveIn(Reg, RC);
3643 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3644 if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3645 // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3646 // they will read physical regs before any side effect instructions.
3647 SDValue ReadFirstLane =
3648 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3650 ReadFirstLane, Val);
3651 }
3652
3653 if (Arg.Flags.isSRet()) {
3654 // The return object should be reasonably addressable.
3655
3656 // FIXME: This helps when the return is a real sret. If it is a
3657 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3658 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3659 unsigned NumBits =
3661 Val = DAG.getNode(
3662 ISD::AssertZext, DL, VT, Val,
3663 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3664 }
3665
3666 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3667 InVals.push_back(Val);
3668 }
3669
3670 // Start adding system SGPRs.
3671 if (IsEntryFunc)
3672 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3673
3674 unsigned StackArgSize = CCInfo.getStackSize();
3675 Info->setBytesInStackArgArea(StackArgSize);
3676
3677 return Chains.empty() ? Chain
3678 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3679}
3680
3681// TODO: If return values can't fit in registers, we should return as many as
3682// possible in registers before passing on stack.
3684 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3685 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3686 const Type *RetTy) const {
3687 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3688 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3689 // for shaders. Vector types should be explicitly handled by CC.
3690 if (AMDGPU::isEntryFunctionCC(CallConv))
3691 return true;
3692
3694 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3695 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3696 return false;
3697
3698 // We must use the stack if return would require unavailable registers.
3699 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3700 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3701 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3702 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3703 return false;
3704
3705 return true;
3706}
3707
3708SDValue
3710 bool isVarArg,
3712 const SmallVectorImpl<SDValue> &OutVals,
3713 const SDLoc &DL, SelectionDAG &DAG) const {
3717
3718 if (AMDGPU::isKernel(CallConv)) {
3719 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3720 OutVals, DL, DAG);
3721 }
3722
3723 bool IsShader = AMDGPU::isShader(CallConv);
3724
3725 Info->setIfReturnsVoid(Outs.empty());
3726 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3727
3728 // CCValAssign - represent the assignment of the return value to a location.
3730
3731 // CCState - Info about the registers and stack slots.
3732 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3733 *DAG.getContext());
3734
3735 // Analyze outgoing return values.
3736 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3737
3738 SDValue Glue;
3740 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3741
3742 SDValue ReadFirstLane =
3743 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3744 // Copy the result values into the output registers.
3745 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3746 ++I, ++RealRVLocIdx) {
3747 CCValAssign &VA = RVLocs[I];
3748 assert(VA.isRegLoc() && "Can only return in registers!");
3749 // TODO: Partially return in registers if return values don't fit.
3750 SDValue Arg = OutVals[RealRVLocIdx];
3751
3752 // Copied from other backends.
3753 switch (VA.getLocInfo()) {
3754 case CCValAssign::Full:
3755 break;
3756 case CCValAssign::BCvt:
3757 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3758 break;
3759 case CCValAssign::SExt:
3760 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3761 break;
3762 case CCValAssign::ZExt:
3763 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3764 break;
3765 case CCValAssign::AExt:
3766 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3767 break;
3768 default:
3769 llvm_unreachable("Unknown loc info!");
3770 }
3771 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3773 ReadFirstLane, Arg);
3774 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3775 Glue = Chain.getValue(1);
3776 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3777 }
3778
3779 // FIXME: Does sret work properly?
3780 if (!Info->isEntryFunction()) {
3781 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3782 const MCPhysReg *I =
3783 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3784 if (I) {
3785 for (; *I; ++I) {
3786 if (AMDGPU::SReg_64RegClass.contains(*I))
3787 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3788 else if (AMDGPU::SReg_32RegClass.contains(*I))
3789 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3790 else
3791 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3792 }
3793 }
3794 }
3795
3796 // Update chain and glue.
3797 RetOps[0] = Chain;
3798 if (Glue.getNode())
3799 RetOps.push_back(Glue);
3800
3801 unsigned Opc = AMDGPUISD::ENDPGM;
3802 if (!IsWaveEnd)
3803 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3804 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3805 : AMDGPUISD::RET_GLUE;
3806 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3807}
3808
3810 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3811 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3812 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3813 SDValue ThisVal) const {
3814 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3815
3816 // Assign locations to each value returned by this call.
3818 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3819 *DAG.getContext());
3820 CCInfo.AnalyzeCallResult(Ins, RetCC);
3821
3822 // Copy all of the result registers out of their specified physreg.
3823 for (CCValAssign VA : RVLocs) {
3824 SDValue Val;
3825
3826 if (VA.isRegLoc()) {
3827 Val =
3828 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3829 Chain = Val.getValue(1);
3830 InGlue = Val.getValue(2);
3831 } else if (VA.isMemLoc()) {
3832 report_fatal_error("TODO: return values in memory");
3833 } else
3834 llvm_unreachable("unknown argument location type");
3835
3836 switch (VA.getLocInfo()) {
3837 case CCValAssign::Full:
3838 break;
3839 case CCValAssign::BCvt:
3840 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3841 break;
3842 case CCValAssign::ZExt:
3843 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3844 DAG.getValueType(VA.getValVT()));
3845 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3846 break;
3847 case CCValAssign::SExt:
3848 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3849 DAG.getValueType(VA.getValVT()));
3850 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3851 break;
3852 case CCValAssign::AExt:
3853 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3854 break;
3855 default:
3856 llvm_unreachable("Unknown loc info!");
3857 }
3858
3859 InVals.push_back(Val);
3860 }
3861
3862 return Chain;
3863}
3864
3865// Add code to pass special inputs required depending on used features separate
3866// from the explicit user arguments present in the IR.
3868 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3869 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3870 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3871 // If we don't have a call site, this was a call inserted by
3872 // legalization. These can never use special inputs.
3873 if (!CLI.CB)
3874 return;
3875
3876 SelectionDAG &DAG = CLI.DAG;
3877 const SDLoc &DL = CLI.DL;
3878 const Function &F = DAG.getMachineFunction().getFunction();
3879
3880 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3881 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3882
3883 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3885
3886 // TODO: Unify with private memory register handling. This is complicated by
3887 // the fact that at least in kernels, the input argument is not necessarily
3888 // in the same location as the input.
3889 // clang-format off
3890 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3891 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3892 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3893 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3894 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3895 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3896 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3897 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3898 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3899 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3900 };
3901 // clang-format on
3902
3903 for (auto [InputID, Attrs] : ImplicitAttrs) {
3904 // If the callee does not use the attribute value, skip copying the value.
3905 if (all_of(Attrs, [&](StringRef Attr) {
3906 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3907 }))
3908 continue;
3909
3910 const auto [OutgoingArg, ArgRC, ArgTy] =
3911 CalleeArgInfo.getPreloadedValue(InputID);
3912 if (!OutgoingArg)
3913 continue;
3914
3915 const auto [IncomingArg, IncomingArgRC, Ty] =
3916 CallerArgInfo.getPreloadedValue(InputID);
3917 assert(IncomingArgRC == ArgRC);
3918
3919 // All special arguments are ints for now.
3920 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3921 SDValue InputReg;
3922
3923 if (IncomingArg) {
3924 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3925 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3926 // The implicit arg ptr is special because it doesn't have a corresponding
3927 // input for kernels, and is computed from the kernarg segment pointer.
3928 InputReg = getImplicitArgPtr(DAG, DL);
3929 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3930 std::optional<uint32_t> Id =
3932 if (Id.has_value()) {
3933 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3934 } else {
3935 InputReg = DAG.getPOISON(ArgVT);
3936 }
3937 } else {
3938 // We may have proven the input wasn't needed, although the ABI is
3939 // requiring it. We just need to allocate the register appropriately.
3940 InputReg = DAG.getPOISON(ArgVT);
3941 }
3942
3943 if (OutgoingArg->isRegister()) {
3944 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3945 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3946 report_fatal_error("failed to allocate implicit input argument");
3947 } else {
3948 unsigned SpecialArgOffset =
3949 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3950 SDValue ArgStore =
3951 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3952 MemOpChains.push_back(ArgStore);
3953 }
3954 }
3955
3956 // Pack workitem IDs into a single register or pass it as is if already
3957 // packed.
3958
3959 auto [OutgoingArg, ArgRC, Ty] =
3961 if (!OutgoingArg)
3962 std::tie(OutgoingArg, ArgRC, Ty) =
3964 if (!OutgoingArg)
3965 std::tie(OutgoingArg, ArgRC, Ty) =
3967 if (!OutgoingArg)
3968 return;
3969
3970 const ArgDescriptor *IncomingArgX = std::get<0>(
3972 const ArgDescriptor *IncomingArgY = std::get<0>(
3974 const ArgDescriptor *IncomingArgZ = std::get<0>(
3976
3977 SDValue InputReg;
3978 SDLoc SL;
3979
3980 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3981 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3982 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3983
3984 // If incoming ids are not packed we need to pack them.
3985 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
3986 NeedWorkItemIDX) {
3987 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3988 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3989 } else {
3990 InputReg = DAG.getConstant(0, DL, MVT::i32);
3991 }
3992 }
3993
3994 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
3995 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3996 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3997 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3998 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3999 InputReg = InputReg.getNode()
4000 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
4001 : Y;
4002 }
4003
4004 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4005 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4006 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
4007 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4008 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4009 InputReg = InputReg.getNode()
4010 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4011 : Z;
4012 }
4013
4014 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4015 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4016 // We're in a situation where the outgoing function requires the workitem
4017 // ID, but the calling function does not have it (e.g a graphics function
4018 // calling a C calling convention function). This is illegal, but we need
4019 // to produce something.
4020 InputReg = DAG.getPOISON(MVT::i32);
4021 } else {
4022 // Workitem ids are already packed, any of present incoming arguments
4023 // will carry all required fields.
4024 ArgDescriptor IncomingArg =
4025 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4026 : IncomingArgY ? *IncomingArgY
4027 : *IncomingArgZ,
4028 ~0u);
4029 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4030 }
4031 }
4032
4033 if (OutgoingArg->isRegister()) {
4034 if (InputReg)
4035 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4036
4037 CCInfo.AllocateReg(OutgoingArg->getRegister());
4038 } else {
4039 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4040 if (InputReg) {
4041 SDValue ArgStore =
4042 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4043 MemOpChains.push_back(ArgStore);
4044 }
4045 }
4046}
4047
// Decide whether this call site may be lowered as a tail call. Chain calls
// (llvm.amdgcn.cs.chain) are unconditionally tail calls; otherwise the callee
// CC must permit tail calls, the call target must be uniform, the caller must
// not be an entry function, and results/arguments/CSRs must be compatible.
// NOTE(review): the original signature line (4048) is elided in this extract —
// presumably `bool SITargetLowering::isEligibleForTailCallOptimization(`.
4049 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4051 const SmallVectorImpl<SDValue> &OutVals,
4052 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Chain calls are always lowered as tail calls by construction.
4053 if (AMDGPU::isChainCC(CalleeCC))
4054 return true;
4055
4056 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
4057 return false;
4058
4059 // For a divergent call target, we need to do a waterfall loop over the
4060 // possible callees which precludes us from using a simple jump.
4061 if (Callee->isDivergent())
4062 return false;
4063
// NOTE(review): line 4064 (presumably obtaining MachineFunction &MF from the
// DAG) is elided in this extract; MF is used below.
4065 const Function &CallerF = MF.getFunction();
4066 CallingConv::ID CallerCC = CallerF.getCallingConv();
4068 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4069
4070 // Kernels aren't callable, and don't have a live in return address so it
4071 // doesn't make sense to do a tail call with entry functions.
4072 if (!CallerPreserved)
4073 return false;
4074
4075 bool CCMatch = CallerCC == CalleeCC;
4076
// NOTE(review): line 4077 (the opening `if` of this block — presumably a
// GuaranteedTailCallOpt check) is elided in this extract.
4078 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4079 return true;
4080 return false;
4081 }
4082
4083 // TODO: Can we handle var args?
4084 if (IsVarArg)
4085 return false;
4086
// Byval arguments in the caller would alias the outgoing argument area; bail.
4087 for (const Argument &Arg : CallerF.args()) {
4088 if (Arg.hasByValAttr())
4089 return false;
4090 }
4091
4092 LLVMContext &Ctx = *DAG.getContext();
4093
4094 // Check that the call results are passed in the same way.
4095 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4096 CCAssignFnForCall(CalleeCC, IsVarArg),
4097 CCAssignFnForCall(CallerCC, IsVarArg)))
4098 return false;
4099
4100 // The callee has to preserve all registers the caller needs to preserve.
4101 if (!CCMatch) {
4102 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4103 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4104 return false;
4105 }
4106
4107 // Nothing more to check if the callee is taking no arguments.
4108 if (Outs.empty())
4109 return true;
4110
// NOTE(review): line 4111 (declaring the ArgLocs SmallVector) is elided.
4112 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4113
4114 // FIXME: We are not allocating special input registers, so we will be
4115 // deciding based on incorrect register assignments.
4116 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4117
4118 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4119 // If the stack arguments for this call do not fit into our own save area then
4120 // the call cannot be made tail.
4121 // TODO: Is this really necessary?
4122 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4123 return false;
4124
4125 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4126 // FIXME: What about inreg arguments that end up passed in memory?
4127 if (!CCVA.isRegLoc())
4128 continue;
4129
4130 // If we are passing an argument in an SGPR, and the value is divergent,
4131 // this call requires a waterfall loop.
4132 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4133 LLVM_DEBUG(
4134 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4135 << printReg(CCVA.getLocReg(), TRI) << '\n');
4136 return false;
4137 }
4138 }
4139
4140 const MachineRegisterInfo &MRI = MF.getRegInfo();
4141 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4142}
4143
// Return true if the call instruction may be emitted as a tail call.
// NOTE(review): the original signature line (4144) is elided in this extract —
// presumably `bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {`.
4145 if (!CI->isTailCall())
4146 return false;
4147
4148 const Function *ParentFn = CI->getFunction();
// NOTE(review): line 4149 (the condition guarding this early return — likely a
// check on ParentFn's calling convention, e.g. entry functions) is elided.
4150 return false;
4151 return true;
4152}
4153
4154namespace {
4155// Chain calls have special arguments that we need to handle. These are
4156// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4157// arguments (index 0 and 1 respectively).
// Indices into CLI.Args for the special trailing operands of an
// llvm.amdgcn.cs.chain call. Enumerators after Exec take implicit
// consecutive values: Flags = 3, NumVGPRs = 4, FallbackExec = 5,
// FallbackCallee = 6 (the NumVGPRs/Fallback* entries are only present when
// the flags value requests dynamic-VGPR mode; see LowerCall).
4158enum ChainCallArgIdx {
4159 Exec = 2,
4160 Flags,
4161 NumVGPRs,
4162 FallbackExec,
4163 FallbackCallee
4164};
4165} // anonymous namespace
4166
4167 // The wave scratch offset register is used as the global base pointer.
// Lower an outgoing call: pops the chain-call special arguments, checks for
// unsupported forms (varargs, libcalls, forced TCO), performs tail-call
// eligibility analysis, assigns argument locations, emits register copies and
// stack stores, and finally builds either a TC_RETURN-style node (tail call)
// or an AMDGPUISD::CALL node plus result copies.
// NOTE(review): the original signature line (4168) is elided in this extract —
// presumably `SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,`.
4169 SmallVectorImpl<SDValue> &InVals) const {
4170 CallingConv::ID CallConv = CLI.CallConv;
4171 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4172
4173 SelectionDAG &DAG = CLI.DAG;
4174
4175 const SDLoc &DL = CLI.DL;
4176 SDValue Chain = CLI.Chain;
4177 SDValue Callee = CLI.Callee;
4178
4179 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4180 bool UsesDynamicVGPRs = false;
4181 if (IsChainCallConv) {
4182 // The last arguments should be the value that we need to put in EXEC,
4183 // followed by the flags and any other arguments with special meanings.
4184 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4185 // we don't treat them like the "real" arguments.
4186 auto RequestedExecIt =
4187 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4188 return Arg.OrigArgIndex == 2;
4189 });
4190 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4191
4192 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4193 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4194 CLI.OutVals.end());
4195 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4196
4197 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4198 "Haven't popped all the special args");
4199
// The EXEC operand must be an integer of exactly wavefront-size bits.
4200 TargetLowering::ArgListEntry RequestedExecArg =
4201 CLI.Args[ChainCallArgIdx::Exec];
4202 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4203 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4204
4205 // Convert constants into TargetConstants, so they become immediate operands
4206 // instead of being selected into S_MOV.
4207 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4208 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4209 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4210 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4211 } else
4212 ChainCallSpecialArgs.push_back(Arg.Node);
4213 };
4214
4215 PushNodeOrTargetConstant(RequestedExecArg);
4216
4217 // Process any other special arguments depending on the value of the flags.
4218 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4219
4220 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4221 if (FlagsValue.isZero()) {
4222 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4223 return lowerUnhandledCall(CLI, InVals,
4224 "no additional args allowed if flags == 0")
4225 } else if (FlagsValue.isOneBitSet(0)) {
// Flags bit 0 requests dynamic-VGPR mode: exactly NumVGPRs, FallbackExec
// and FallbackCallee must follow, and the target must be wave32.
4226 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4227 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4228 }
4229
4230 if (!Subtarget->isWave32()) {
4231 return lowerUnhandledCall(
4232 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4233 }
4234
4235 UsesDynamicVGPRs = true;
4236 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4237 CLI.Args.end(), PushNodeOrTargetConstant);
4238 }
4239 }
4240
// NOTE(review): line 4241 (presumably binding CLI.Outs) is elided here.
4242 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4244 bool &IsTailCall = CLI.IsTailCall;
4245 bool IsVarArg = CLI.IsVarArg;
4246 bool IsSibCall = false;
4248
// Calls to a poison/null callee are dead; return poison results (if any) and
// the incoming chain.
4249 if (Callee.isUndef() || isNullConstant(Callee)) {
4250 if (!CLI.IsTailCall) {
4251 for (ISD::InputArg &Arg : CLI.Ins)
4252 InVals.push_back(DAG.getPOISON(Arg.VT));
4253 }
4254
4255 return Chain;
4256 }
4257
4258 if (IsVarArg) {
4259 return lowerUnhandledCall(CLI, InVals,
4260 "unsupported call to variadic function ");
4261 }
4262
4263 if (!CLI.CB)
4264 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4265
4266 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4267 return lowerUnhandledCall(CLI, InVals,
4268 "unsupported required tail call to function ");
4269 }
4270
4271 if (IsTailCall) {
4272 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4273 Outs, OutVals, Ins, DAG);
// musttail and cs.chain call sites have no non-tail fallback; hard error.
4274 if (!IsTailCall &&
4275 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4276 report_fatal_error("failed to perform tail call elimination on a call "
4277 "site marked musttail or on llvm.amdgcn.cs.chain");
4278 }
4279
4280 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4281
4282 // A sibling call is one where we're under the usual C ABI and not planning
4283 // to change that but can still do a tail call:
4284 if (!TailCallOpt && IsTailCall)
4285 IsSibCall = true;
4286
4287 if (IsTailCall)
4288 ++NumTailCalls;
4289 }
4290
// NOTE(review): lines 4291-4292 (presumably the Info and RegsToPass
// declarations) are elided in this extract.
4293 SmallVector<SDValue, 8> MemOpChains;
4294
4295 // Analyze operands of the call, assigning locations to each operand.
4297 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4298 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4299
4300 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4302 // With a fixed ABI, allocate fixed registers before user arguments.
4303 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4304 }
4305
4306 // Mark the scratch resource descriptor as allocated so the CC analysis
4307 // does not assign user arguments to these registers, matching the callee.
4308 if (!Subtarget->hasFlatScratchEnabled())
4309 CCInfo.AllocateReg(Info->getScratchRSrcReg());
4310
4311 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4312
4313 // Get a count of how many bytes are to be pushed on the stack.
4314 unsigned NumBytes = CCInfo.getStackSize();
4315
4316 if (IsSibCall) {
4317 // Since we're not changing the ABI to make this a tail call, the memory
4318 // operands are already available in the caller's incoming argument space.
4319 NumBytes = 0;
4320 }
4321
4322 // FPDiff is the byte offset of the call's argument area from the callee's.
4323 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4324 // by this amount for a tail call. In a sibling call it must be 0 because the
4325 // caller will deallocate the entire stack and the callee still expects its
4326 // arguments to begin at SP+0. Completely unused for non-tail calls.
4327 int32_t FPDiff = 0;
4328 MachineFrameInfo &MFI = MF.getFrameInfo();
4329 auto *TRI = Subtarget->getRegisterInfo();
4330
4331 // Adjust the stack pointer for the new arguments...
4332 // These operations are automatically eliminated by the prolog/epilog pass
4333 if (!IsSibCall)
4334 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4335
4336 if (!IsSibCall || IsChainCallConv) {
4337 if (!Subtarget->hasFlatScratchEnabled()) {
4338 SmallVector<SDValue, 4> CopyFromChains;
4339
4340 // In the HSA case, this should be an identity copy.
4341 SDValue ScratchRSrcReg =
4342 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4343 RegsToPass.emplace_back(IsChainCallConv
4344 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4345 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4346 ScratchRSrcReg);
4347 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4348 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4349 }
4350 }
4351
// Entries appended to RegsToPass so far are implicit/special inputs; user
// arguments follow. The split point matters for the readfirstlane loop below.
4352 const unsigned NumSpecialInputs = RegsToPass.size();
4353
4354 MVT PtrVT = MVT::i32;
4355
4356 // Walk the register/memloc assignments, inserting copies/loads.
4357 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4358 CCValAssign &VA = ArgLocs[i];
4359 SDValue Arg = OutVals[i];
4360
4361 // Promote the value if needed.
4362 switch (VA.getLocInfo()) {
4363 case CCValAssign::Full:
4364 break;
4365 case CCValAssign::BCvt:
4366 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4367 break;
4368 case CCValAssign::ZExt:
4369 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4370 break;
4371 case CCValAssign::SExt:
4372 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4373 break;
4374 case CCValAssign::AExt:
4375 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4376 break;
4377 case CCValAssign::FPExt:
4378 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4379 break;
4380 default:
4381 llvm_unreachable("Unknown loc info!");
4382 }
4383
4384 if (VA.isRegLoc()) {
4385 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4386 } else {
4387 assert(VA.isMemLoc());
4388
4389 SDValue DstAddr;
4390 MachinePointerInfo DstInfo;
4391
4392 unsigned LocMemOffset = VA.getLocMemOffset();
4393 int32_t Offset = LocMemOffset;
4394
4395 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4396 MaybeAlign Alignment;
4397
4398 if (IsTailCall) {
4399 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4400 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4401 : VA.getValVT().getStoreSize();
4402
4403 // FIXME: We can have better than the minimum byval required alignment.
4404 Alignment =
4405 Flags.isByVal()
4406 ? Flags.getNonZeroByValAlign()
4407 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4408
4409 Offset = Offset + FPDiff;
4410 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4411
4412 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4413 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4414
4415 // Make sure any stack arguments overlapping with where we're storing
4416 // are loaded before this eventual operation. Otherwise they'll be
4417 // clobbered.
4418
4419 // FIXME: Why is this really necessary? This seems to just result in a
4420 // lot of code to copy the stack and write them back to the same
4421 // locations, which are supposed to be immutable?
4422 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4423 } else {
4424 // Stores to the argument stack area are relative to the stack pointer.
4425 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4426 MVT::i32);
4427 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4428 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4429 Alignment =
4430 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4431 }
4432
4433 if (Outs[i].Flags.isByVal()) {
4434 SDValue SizeNode =
4435 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4436 SDValue Cpy =
4437 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4438 Outs[i].Flags.getNonZeroByValAlign(),
4439 /*isVol = */ false, /*AlwaysInline = */ true,
4440 /*CI=*/nullptr, std::nullopt, DstInfo,
4442
4443 MemOpChains.push_back(Cpy);
4444 } else {
4445 SDValue Store =
4446 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4447 MemOpChains.push_back(Store);
4448 }
4449 }
4450 }
4451
4452 if (!MemOpChains.empty())
4453 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4454
4455 SDValue ReadFirstLaneID =
4456 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4457
4458 SDValue TokenGlue;
4459 if (CLI.ConvergenceControlToken) {
4460 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4462 }
4463
4464 // Build a sequence of copy-to-reg nodes chained together with token chain
4465 // and flag operands which copy the outgoing args into the appropriate regs.
4466 SDValue InGlue;
4467
4468 unsigned ArgIdx = 0;
4469 for (auto [Reg, Val] : RegsToPass) {
4470 if (ArgIdx++ >= NumSpecialInputs &&
4471 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4472 // For chain calls, the inreg arguments are required to be
4473 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
4474 // they are uniform.
4475 //
4476 // For other calls, if an inreg arguments is known to be uniform,
4477 // speculatively insert a readfirstlane in case it is in a VGPR.
4478 //
4479 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4480 // value, so let that continue to produce invalid code.
4481
4482 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4483 if (TokenGlue)
4484 ReadfirstlaneArgs.push_back(TokenGlue);
4486 ReadfirstlaneArgs);
4487 }
4488
4489 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4490 InGlue = Chain.getValue(1);
4491 }
4492
4493 // We don't usually want to end the call-sequence here because we would tidy
4494 // the frame up *after* the call, however in the ABI-changing tail-call case
4495 // we've carefully laid out the parameters so that when sp is reset they'll be
4496 // in the correct location.
4497 if (IsTailCall && !IsSibCall) {
4498 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4499 InGlue = Chain.getValue(1);
4500 }
4501
4502 std::vector<SDValue> Ops({Chain});
4503
4504 // Add a redundant copy of the callee global which will not be legalized, as
4505 // we need direct access to the callee later.
// NOTE(review): line 4506 (the `if` matching the `else` below — presumably a
// dyn_cast of Callee to GlobalAddressSDNode named GSD) is elided here.
4507 const GlobalValue *GV = GSD->getGlobal();
4508 Ops.push_back(Callee);
4509 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4510 } else {
4511 if (IsTailCall) {
4512 // isEligibleForTailCallOptimization considered whether the call target is
4513 // divergent, but we may still end up with a uniform value in a VGPR.
4514 // Insert a readfirstlane just in case.
4515 SDValue ReadFirstLaneID =
4516 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4517
4518 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4519 if (TokenGlue)
4520 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4521 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4522 ReadfirstlaneArgs);
4523 }
4524
4525 Ops.push_back(Callee);
4526 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4527 }
4528
4529 if (IsTailCall) {
4530 // Each tail call may have to adjust the stack by a different amount, so
4531 // this information must travel along with the operation for eventual
4532 // consumption by emitEpilogue.
4533 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4534 }
4535
4536 if (IsChainCallConv)
4537 llvm::append_range(Ops, ChainCallSpecialArgs);
4538
4539 // Add argument registers to the end of the list so that they are known live
4540 // into the call.
4541 for (auto &[Reg, Val] : RegsToPass)
4542 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4543
4544 // Add a register mask operand representing the call-preserved registers.
4545 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4546 assert(Mask && "Missing call preserved mask for calling convention");
4547 Ops.push_back(DAG.getRegisterMask(Mask));
4548
4549 if (SDValue Token = CLI.ConvergenceControlToken) {
// NOTE(review): line 4550 (declaring the GlueOps SmallVector) is elided here.
4551 GlueOps.push_back(Token);
4552 if (InGlue)
4553 GlueOps.push_back(InGlue);
4554
4555 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4556 MVT::Glue, GlueOps),
4557 0);
4558 }
4559
4560 if (InGlue)
4561 Ops.push_back(InGlue);
4562
4563 // If we're doing a tail call, use a TC_RETURN here rather than an
4564 // actual call instruction.
4565 if (IsTailCall) {
4566 MFI.setHasTailCall();
4567 unsigned OPC = AMDGPUISD::TC_RETURN;
4568 switch (CallConv) {
// NOTE(review): the case labels (lines 4569 and 4572-4573, presumably
// AMDGPU_Gfx and the chain calling conventions) are elided in this extract.
4570 OPC = AMDGPUISD::TC_RETURN_GFX;
4571 break;
4574 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4575 : AMDGPUISD::TC_RETURN_CHAIN;
4576 break;
4577 }
4578
4579 // If the caller is a whole wave function, we need to use a special opcode
4580 // so we can patch up EXEC.
4581 if (Info->isWholeWaveFunction())
4582 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4583
4584 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4585 }
4586
4587 // Returns a chain and a flag for retval copy to use.
4588 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4589 Chain = Call.getValue(0);
4590 InGlue = Call.getValue(1);
4591
4592 uint64_t CalleePopBytes = NumBytes;
4593 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4594 if (!Ins.empty())
4595 InGlue = Chain.getValue(1);
4596
4597 // Handle result values, copying them out of physregs into vregs that we
4598 // return.
4599 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4600 InVals, /*IsThisReturn=*/false, SDValue());
4601}
4602
4603 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4604 // except for:
4605 // 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4606 // 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
// Returns the (pre-bump) base address of the allocation and the chain from
// the CALLSEQ_END that re-publishes the updated stack pointer.
// NOTE(review): the original signature line (4607) is elided in this extract —
// presumably `SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,`.
4608 SelectionDAG &DAG) const {
4609 const MachineFunction &MF = DAG.getMachineFunction();
// NOTE(review): line 4610 (obtaining the SIMachineFunctionInfo *Info used
// below) is elided in this extract.
4611
4612 SDLoc dl(Op);
4613 EVT VT = Op.getValueType();
4614 SDValue Chain = Op.getOperand(0);
4615 Register SPReg = Info->getStackPtrOffsetReg();
4616
4617 // Chain the dynamic stack allocation so that it doesn't modify the stack
4618 // pointer when other instructions are using the stack.
4619 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4620
4621 SDValue Size = Op.getOperand(1);
4622 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4623 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4624
4625 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
// NOTE(review): line 4626 (the assert condition on TFL's stack growth
// direction) is elided; only its message string remains below.
4627 "Stack grows upwards for AMDGPU");
4628
4629 Chain = BaseAddr.getValue(1);
4630 Align StackAlign = TFL->getStackAlign();
// Over-aligned allocas: round the (wave-scaled) SP up to the requested
// alignment scaled by the wavefront size.
4631 if (Alignment > StackAlign) {
4632 uint64_t ScaledAlignment = Alignment.value()
4633 << Subtarget->getWavefrontSizeLog2();
4634 uint64_t StackAlignMask = ScaledAlignment - 1;
4635 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4636 DAG.getConstant(StackAlignMask, dl, VT));
4637 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4638 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4639 }
4640
4641 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4642 SDValue NewSP;
// NOTE(review): line 4643 (the `if` matching the `else` below — presumably a
// test for a constant/uniform Size) is elided in this extract.
4644 // For constant sized alloca, scale alloca size by wave-size
4645 SDValue ScaledSize = DAG.getNode(
4646 ISD::SHL, dl, VT, Size,
4647 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4648 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4649 } else {
4650 // For dynamic sized alloca, perform wave-wide reduction to get max of
4651 // alloca size(divergent) and then scale it by wave-size
4652 SDValue WaveReduction =
4653 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4654 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4655 Size, DAG.getConstant(0, dl, MVT::i32));
4656 SDValue ScaledSize = DAG.getNode(
4657 ISD::SHL, dl, VT, Size,
4658 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4659 NewSP =
4660 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
// The SP must be wave-uniform; force the updated value into an SGPR.
4661 SDValue ReadFirstLaneID =
4662 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4663 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4664 NewSP);
4665 }
4666
4667 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4668 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4669
4670 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4671}
4672
// Lower STACKSAVE: read the wave-uniform stack pointer and convert it to a
// per-lane (swizzled) vector address via AMDGPUISD::WAVE_ADDRESS.
// NOTE(review): the original signature line (4673) is elided in this extract —
// presumably `SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {`.
4674 if (Op.getValueType() != MVT::i32)
4675 return Op; // Defer to cannot select error.
4676
// NOTE(review): line 4677 (obtaining the stack pointer register `SP`, used in
// the CopyFromReg below) is elided in this extract.
4678 SDLoc SL(Op);
4679
4680 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4681
4682 // Convert from wave uniform to swizzled vector address. This should protect
4683 // from any edge cases where the stacksave result isn't directly used with
4684 // stackrestore.
4685 SDValue VectorAddress =
4686 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4687 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4688}
4689
// Lower GET_ROUNDING: read both hardware rounding-mode fields via s_getreg
// and translate the raw 4-bit value to the FLT_ROUNDS enumeration through a
// 64-bit constant lookup table.
// NOTE(review): the original signature line (4690) is elided in this extract —
// presumably `SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,`.
4691 SelectionDAG &DAG) const {
4692 SDLoc SL(Op);
4693 assert(Op.getValueType() == MVT::i32);
4694
4695 uint32_t BothRoundHwReg =
// NOTE(review): line 4696 (the hwreg encoding expression — presumably
// AMDGPU::Hwreg::HwregEncoding for HW_REG_MODE's round bits) is elided.
4697 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4698
4699 SDValue IntrinID =
4700 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4701 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4702 Op.getOperand(0), IntrinID, GetRoundBothImm);
4703
4704 // There are two rounding modes, one for f32 and one for f64/f16. We only
4705 // report in the standard value range if both are the same.
4706 //
4707 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4708 // ties away from zero is not supported, and the other values are rotated by
4709 // 1.
4710 //
4711 // If the two rounding modes are not the same, report a target defined value.
4712
4713 // Mode register rounding mode fields:
4714 //
4715 // [1:0] Single-precision round mode.
4716 // [3:2] Double/Half-precision round mode.
4717 //
4718 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4719 //
4720 // Hardware Spec
4721 // Toward-0 3 0
4722 // Nearest Even 0 1
4723 // +Inf 1 2
4724 // -Inf 2 3
4725 // NearestAway0 N/A 4
4726 //
4727 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4728 // table we can index by the raw hardware mode.
4729 //
4730 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4731
4732 SDValue BitTable =
// NOTE(review): line 4733 (the table constant — presumably
// AMDGPU::FltRoundConversionTable as an i64 constant) is elided here.
4734
4735 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
// Each table entry is 4 bits wide, so the shift amount is mode * 4.
4736 SDValue RoundModeTimesNumBits =
4737 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4738
4739 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4740 // knew only one mode was demanded.
4741 SDValue TableValue =
4742 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4743 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4744
4745 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4746 SDValue TableEntry =
4747 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4748
4749 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4750 // if it's an extended value.
4751 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4752 SDValue IsStandardValue =
4753 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4754 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4755 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4756 TableEntry, EnumOffset);
4757
4758 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4759}
4760
// Lower SET_ROUNDING: map the FLT_ROUNDS-style input to the hardware
// MODE.fp_round encoding (via constant folding, a reduced 32-bit table, or the
// full 64-bit table) and write it with s_setreg.
// NOTE(review): the original signature line (4761) is elided in this extract —
// presumably `SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,`.
4762 SelectionDAG &DAG) const {
4763 SDLoc SL(Op);
4764
4765 SDValue NewMode = Op.getOperand(1);
4766 assert(NewMode.getValueType() == MVT::i32);
4767
4768 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4769 // hardware MODE.fp_round values.
4770 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
// Constant input: do the table lookup at compile time.
4771 uint32_t ClampedVal = std::min(
4772 static_cast<uint32_t>(ConstMode->getZExtValue()),
// NOTE(review): line 4773 (the clamp upper bound — presumably a
// TowardZeroF32_TowardNegativeF64-style last table entry) is elided here.
4774 NewMode = DAG.getConstant(
4775 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4776 } else {
4777 // If we know the input can only be one of the supported standard modes in
4778 // the range 0-3, we can use a simplified mapping to hardware values.
4779 KnownBits KB = DAG.computeKnownBits(NewMode);
4780 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4781 // The supported standard values are 0-3. The extended values start at 8. We
4782 // need to offset by 4 if the value is in the extended range.
4783
4784 if (UseReducedTable) {
4785 // Truncate to the low 32-bits.
4786 SDValue BitTable = DAG.getConstant(
4787 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4788
4789 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4790 SDValue RoundModeTimesNumBits =
4791 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4792
4793 NewMode =
4794 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4795
4796 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4797 // the table extracted bits into inline immediates.
4798 } else {
4799 // table_index = umin(value, value - 4)
4800 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4801 SDValue BitTable =
// NOTE(review): line 4802 (the full 64-bit table constant — presumably
// AMDGPU::FltRoundToHWConversionTable as an i64 constant) is elided here.
4803
4804 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4805 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
// umin folds the standard (0-3) and extended (8+) ranges into one index:
// extended values wrap below after the subtract, standard values stay put.
4806 SDValue IndexVal =
4807 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4808
4809 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4810 SDValue RoundModeTimesNumBits =
4811 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4812
4813 SDValue TableValue =
4814 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4815 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4816
4817 // No need to mask out the high bits since the setreg will ignore them
4818 // anyway.
4819 NewMode = TruncTable;
4820 }
4821
4822 // Insert a readfirstlane in case the value is a VGPR. We could do this
4823 // earlier and keep more operations scalar, but that interferes with
4824 // combining the source.
4825 SDValue ReadFirstLaneID =
4826 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4827 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4828 ReadFirstLaneID, NewMode);
4829 }
4830
4831 // N.B. The setreg will be later folded into s_round_mode on supported
4832 // targets.
4833 SDValue IntrinID =
4834 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4835 uint32_t BothRoundHwReg =
// NOTE(review): line 4836 (the hwreg encoding expression for the combined
// round-mode field) is elided in this extract.
4837 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4838
4839 SDValue SetReg =
4840 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4841 IntrinID, RoundBothImm, NewMode);
4842
4843 return SetReg;
4844}
4845
// NOTE(review): doxygen extraction dropped this function's signature line
// (orig. 4846) and the switch's address-space case labels (orig. 4853-4857);
// this is the body of the PREFETCH lowering — verify labels against upstream.
// Returns SDValue() to expand/drop the prefetch, or Op to keep it as-is.
4847 if (Op->isDivergent() &&
4848 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4849 // Cannot do I$ prefetch with divergent pointer.
4850 return SDValue();
4851
// Only specific address spaces are prefetchable; unsafe-SMEM subtargets fall
// through to the default reject path.
4852 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4856 break;
4858 if (Subtarget->hasSafeSmemPrefetch())
4859 break;
4860 [[fallthrough]];
4861 default:
4862 return SDValue();
4863 }
4864
4865 // I$ prefetch
// Operand 4 distinguishes instruction prefetch; without safe SMEM prefetch,
// only the I$ form is kept.
4866 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4867 return SDValue();
4868
4869 return Op;
4870}
4871
4872 // Work around DAG legality rules only based on the result type.
// NOTE(review): signature line (orig. 4873) lost in extraction — this is the
// FP_EXTEND / STRICT_FP_EXTEND lowering body.
// Non-bf16 sources are left for generic handling; a bf16 source is bitcast to
// its integer type and extended via BF16_TO_FP. Strict variant is unsupported.
4874 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4875 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4876 EVT SrcVT = Src.getValueType();
4877
4878 if (SrcVT.getScalarType() != MVT::bf16)
4879 return Op;
4880
4881 SDLoc SL(Op);
4882 SDValue BitCast =
4883 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4884
4885 EVT DstVT = Op.getValueType();
4886 if (IsStrict)
4887 llvm_unreachable("Need STRICT_BF16_TO_FP");
4888
4889 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4890}
4891
// NOTE(review): signature line (orig. 4892) lost in extraction — this is the
// GET_FPENV lowering body. The MODE and TRAP hwreg encodings (orig. 4898,
// 4901) were also dropped; verify against upstream.
// Reads the MODE and TRAP hardware registers with s_getreg, merges their
// chains with a TokenFactor, and packs the two i32 results into one i64.
4893 SDLoc SL(Op);
4894 if (Op.getValueType() != MVT::i64)
4895 return Op;
4896
4897 uint32_t ModeHwReg =
4899 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4900 uint32_t TrapHwReg =
4902 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4903
4904 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4905 SDValue IntrinID =
4906 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4907 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4908 Op.getOperand(0), IntrinID, ModeHwRegImm);
4909 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4910 Op.getOperand(0), IntrinID, TrapHwRegImm);
// Join the two reads' chains so later chain users depend on both.
4911 SDValue TokenReg =
4912 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4913 GetTrapReg.getValue(1));
4914
// Pack {mode, trap} into a v2i32 and reinterpret as the i64 result.
4915 SDValue CvtPtr =
4916 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg)
4917 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4918
4919 return DAG.getMergeValues({Result, TokenReg}, SL);
4920}
4921
// NOTE(review): signature line (orig. 4922) lost in extraction — this is the
// SET_FPENV lowering body. The hwreg encodings (orig. 4941, 4944) were also
// dropped; verify against upstream.
// Splits the i64 environment into two i32 halves, forces each to be uniform
// with readfirstlane (s_setreg needs an SGPR), then writes the MODE and TRAP
// hardware registers and joins both stores with a TokenFactor.
4923 SDLoc SL(Op);
4924 if (Op.getOperand(1).getValueType() != MVT::i64)
4925 return Op;
4926
4927 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4928 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4929 DAG.getConstant(0, SL, MVT::i32));
4930 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4931 DAG.getConstant(1, SL, MVT::i32));
4932
4933 SDValue ReadFirstLaneID =
4934 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4935 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4936 ReadFirstLaneID, NewModeReg);
4937 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4938 ReadFirstLaneID, NewTrapReg);
4939
4940 unsigned ModeHwReg =
4942 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4943 unsigned TrapHwReg =
4945 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4946
4947 SDValue IntrinID =
4948 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
// Both setregs use the same incoming chain (Op.getOperand(0)) and are then
// sequenced together via TokenFactor.
4949 SDValue SetModeReg =
4950 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4951 IntrinID, ModeHwRegImm, NewModeReg);
4952 SDValue SetTrapReg =
4953 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4954 IntrinID, TrapHwRegImm, NewTrapReg);
4955 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4956}
4957
// NOTE(review): the first signature line and the StringSwitch head (orig.
// 4958, 4962) were lost in extraction — this is
// SITargetLowering::getRegisterByName. Maps a named-register string to a
// physical register, rejecting flat_scratch on subtargets without it and
// registers whose width does not match the requested type.
4959 const MachineFunction &MF) const {
4960 const Function &Fn = MF.getFunction();
4961
4963 .Case("m0", AMDGPU::M0)
4964 .Case("exec", AMDGPU::EXEC)
4965 .Case("exec_lo", AMDGPU::EXEC_LO)
4966 .Case("exec_hi", AMDGPU::EXEC_HI)
4967 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4968 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4969 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4970 .Default(Register());
// Unknown name: return the null register to the caller.
4971 if (!Reg)
4972 return Reg;
4973
4974 if (!Subtarget->hasFlatScrRegister() &&
4975 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4976 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4977 "\" for subtarget."));
4978 }
4979
// 32-bit halves require a 32-bit request; the full exec/flat_scratch pairs
// require 64 bits.
4980 switch (Reg) {
4981 case AMDGPU::M0:
4982 case AMDGPU::EXEC_LO:
4983 case AMDGPU::EXEC_HI:
4984 case AMDGPU::FLAT_SCR_LO:
4985 case AMDGPU::FLAT_SCR_HI:
4986 if (VT.getSizeInBits() == 32)
4987 return Reg;
4988 break;
4989 case AMDGPU::EXEC:
4990 case AMDGPU::FLAT_SCR:
4991 if (VT.getSizeInBits() == 64)
4992 return Reg;
4993 break;
4994 default:
4995 llvm_unreachable("missing register type checking");
4996 }
4997
// NOTE(review): the call wrapping this Twine (orig. 4998, presumably
// report_fatal_error) was dropped by extraction.
4999 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5000}
5001
5002 // If kill is not the last instruction, split the block so kill is always a
5003 // proper terminator.
// NOTE(review): signature lines (orig. 5004-5005) lost in extraction; this is
// SITargetLowering::splitKillBlock. Splits \p BB after \p MI and rewrites the
// kill pseudo into its terminator form; returns the fall-through block.
5006 MachineBasicBlock *BB) const {
5007 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
5009 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5010 return SplitBB;
5011}
5012
5013 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
5014 // \p MI will be the only instruction in the loop body block. Otherwise, it will
5015 // be the first instruction in the remainder block.
5016 //
5017 /// \returns { LoopBody, Remainder }
// NOTE(review): extraction dropped the parameter line (orig. 5019) and the
// LoopBB creation / iterator setup lines (orig. 5025, 5027); comments below
// describe only the visible code.
5018 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5020 MachineFunction *MF = MBB.getParent();
5022
5023 // To insert the loop we need to split the block. Move everything after this
5024 // point to a new block, and insert a new empty block between the two.
5026 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5028 ++MBBI;
5029
5030 MF->insert(MBBI, LoopBB);
5031 MF->insert(MBBI, RemainderBB);
5032
// Loop body both backedges to itself and can exit to the remainder.
5033 LoopBB->addSuccessor(LoopBB);
5034 LoopBB->addSuccessor(RemainderBB);
5035
5036 // Move the rest of the block into a new block.
5037 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5038
5039 if (InstInLoop) {
5040 auto Next = std::next(I);
5041
5042 // Move instruction to loop body.
5043 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5044
5045 // Move the rest of the block.
5046 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5047 } else {
5048 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5049 }
5050
5051 MBB.addSuccessor(LoopBB);
5052
5053 return std::pair(LoopBB, RemainderBB);
5054}
5055
5056 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
// NOTE(review): the signature line (orig. 5057) and the TII initialization
// (orig. 5059) were lost in extraction.
// The bundle keeps the waitcnt glued to MI so later passes cannot separate
// them.
5058 MachineBasicBlock *MBB = MI.getParent();
5060 auto I = MI.getIterator();
5061 auto E = std::next(I);
5062
5063 // clang-format off
5064 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5065 .addImm(0);
5066 // clang-format on
5067
5068 MIBundleBuilder Bundler(*MBB, I, E);
5069 finalizeBundle(*MBB, Bundler.begin());
5070}
5071
5074 MachineBasicBlock *BB) const {
5075 const DebugLoc &DL = MI.getDebugLoc();
5076
5078
5080
5081 // Apparently kill flags are only valid if the def is in the same block?
5082 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5083 Src->setIsKill(false);
5084
5085 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5086
5087 MachineBasicBlock::iterator I = LoopBB->end();
5088
5089 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5091
5092 // Clear TRAP_STS.MEM_VIOL
5093 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5094 .addImm(0)
5095 .addImm(EncodedReg);
5096
5098
5099 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5100
5101 // Load and check TRAP_STS.MEM_VIOL
5102 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5103 .addImm(EncodedReg);
5104
5105 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5106 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5107 .addReg(Reg, RegState::Kill)
5108 .addImm(0);
5109 // clang-format off
5110 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5111 .addMBB(LoopBB);
5112 // clang-format on
5113
5114 return RemainderBB;
5115}
5116
5117 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5118 // wavefront. If the value is uniform and just happens to be in a VGPR, this
5119 // will only do one iteration. In the worst case, this will loop 64 times.
5120 //
5121 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
// NOTE(review): the return-type/name lines (orig. 5122-5123) and two
// initialization lines (orig. 5133-5134, presumably MRI and the lane-mask
// constants LMC) were lost in extraction. This emits the classic "waterfall"
// loop body: readfirstlane an index, mask EXEC to the lanes matching it, do
// one indexed access, then xor those lanes out of EXEC and repeat.
5129
5130 MachineFunction *MF = OrigBB.getParent();
5131 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5132 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5135
5136 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5137 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5138 Register NewExec = MRI.createVirtualRegister(BoolRC);
5139 Register CurrentIdxReg =
5140 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5141 Register CondReg = MRI.createVirtualRegister(BoolRC);
5142
// PHIs tie the accumulated result and remaining-lanes mask across iterations.
5143 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5144 .addReg(InitReg)
5145 .addMBB(&OrigBB)
5146 .addReg(ResultReg)
5147 .addMBB(&LoopBB);
5148
5149 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5150 .addReg(InitSaveExecReg)
5151 .addMBB(&OrigBB)
5152 .addReg(NewExec)
5153 .addMBB(&LoopBB);
5154
5155 // Read the next variant <- also loop target.
5156 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5157 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5158
5159 // Compare the just read M0 value to all possible Idx values.
5160 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5161 .addReg(CurrentIdxReg)
5162 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5163
5164 // Update EXEC, save the original EXEC value to VCC.
5165 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5166 .addReg(CondReg, RegState::Kill);
5167
// Hint the allocator to coalesce NewExec with CondReg.
5168 MRI.setSimpleHint(NewExec, CondReg);
5169
// Publish the uniform index either as an SGPR (GPR-index mode) or via M0.
5170 if (UseGPRIdxMode) {
5171 if (Offset == 0) {
5172 SGPRIdxReg = CurrentIdxReg;
5173 } else {
5174 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5175 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5176 .addReg(CurrentIdxReg, RegState::Kill)
5177 .addImm(Offset);
5178 }
5179 } else {
5180 // Move index from VCC into M0
5181 if (Offset == 0) {
5182 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5183 .addReg(CurrentIdxReg, RegState::Kill);
5184 } else {
5185 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5186 .addReg(CurrentIdxReg, RegState::Kill)
5187 .addImm(Offset);
5188 }
5189 }
5190
5191 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5192 MachineInstr *InsertPt =
5193 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5194 .addReg(LMC.ExecReg)
5195 .addReg(NewExec);
5196
5197 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5198 // s_cbranch_scc0?
5199
5200 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5201 // clang-format off
5202 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5203 .addMBB(&LoopBB);
5204 // clang-format on
5205
// Caller inserts the per-iteration work before this point (after the xor
// terminator).
5206 return InsertPt->getIterator();
5207}
5208
5209 // This has slightly sub-optimal regalloc when the source vector is killed by
5210 // the read. The register allocator does not understand that the kill is
5211 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
5212 // subregister from it, using 1 more VGPR than necessary. This was saved when
5213 // this was expanded after register allocation.
// NOTE(review): the signature lines (orig. 5214-5215) and three
// initialization lines (orig. 5221, 5223, 5229 — presumably MRI, the debug
// iterator I, and the lane-mask constants LMC) were lost in extraction.
// Wraps emitLoadM0FromVGPRLoop: saves EXEC, splits out the loop, then
// restores EXEC in a fresh landing-pad block before the remainder.
5216 unsigned InitResultReg, unsigned PhiReg, int Offset,
5217 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5218 MachineFunction *MF = MBB.getParent();
5219 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5220 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5222 const DebugLoc &DL = MI.getDebugLoc();
5224
5225 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5226 Register DstReg = MI.getOperand(0).getReg();
5227 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5228 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5230
5231 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5232
5233 // Save the EXEC mask
5234 // clang-format off
5235 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5236 .addReg(LMC.ExecReg);
5237 // clang-format on
5238
5239 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5240
5241 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5242
5243 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5244 InitResultReg, DstReg, PhiReg, TmpExec,
5245 Offset, UseGPRIdxMode, SGPRIdxReg);
5246
// Interpose a landing pad between the loop and the remainder so the EXEC
// restore executes exactly once after the loop exits.
5247 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5249 ++MBBI;
5250 MF->insert(MBBI, LandingPad);
5251 LoopBB->removeSuccessor(RemainderBB);
5252 LandingPad->addSuccessor(RemainderBB);
5253 LoopBB->addSuccessor(LandingPad);
5254 MachineBasicBlock::iterator First = LandingPad->begin();
5255 // clang-format off
5256 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5257 .addReg(SaveExec);
5258 // clang-format on
5259
5260 return InsPt;
5261}
5262
5263 // Returns subreg index, offset
// NOTE(review): the function-name line (orig. 5265) was lost in extraction;
// this is computeIndirectRegAndOffset. Folds a constant element offset into a
// subregister index when it is in range; out-of-range offsets keep sub0 and
// return the offset unchanged so the caller can handle it dynamically.
5264 static std::pair<unsigned, int>
5266 const TargetRegisterClass *SuperRC, unsigned VecReg,
5267 int Offset) {
// Number of 32-bit lanes in the super-register class.
5268 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5269
5270 // Skip out of bounds offsets, or else we would end up using an undefined
5271 // register.
5272 if (Offset >= NumElts || Offset < 0)
5273 return std::pair(AMDGPU::sub0, Offset);
5274
5275 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5276}
5277
// NOTE(review): signature lines (orig. 5278-5279) and an initialization line
// (orig. 5283, presumably the insert iterator I) were lost in extraction;
// this is setM0ToIndexFromSGPR. Copies the SGPR index operand (plus an
// optional constant offset) into M0 for movrel addressing.
5280 int Offset) {
5281 MachineBasicBlock *MBB = MI.getParent();
5282 const DebugLoc &DL = MI.getDebugLoc();
5284
5285 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5286
5287 assert(Idx->getReg() != AMDGPU::NoRegister);
5288
5289 if (Offset == 0) {
5290 // clang-format off
5291 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5292 .add(*Idx);
5293 // clang-format on
5294 } else {
5295 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5296 .add(*Idx)
5297 .addImm(Offset);
5298 }
5299}
5300
5303 int Offset) {
5304 MachineBasicBlock *MBB = MI.getParent();
5305 const DebugLoc &DL = MI.getDebugLoc();
5307
5308 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5309
5310 if (Offset == 0)
5311 return Idx->getReg();
5312
5313 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5314 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5315 .add(*Idx)
5316 .addImm(Offset);
5317 return Tmp;
5318}
5319
// NOTE(review): signature lines (orig. 5320-5321) and several short setup
// lines (orig. 5326, 5344, 5351, 5360, 5374) were lost in extraction; this is
// emitIndirectSrc — expansion of the indirect vector-element read pseudo.
// Fast path: an SGPR index becomes a single GPR-idx read or v_movrels.
// Slow path: a VGPR index requires the waterfall loop via loadM0FromVGPR.
5322 const GCNSubtarget &ST) {
5323 const SIInstrInfo *TII = ST.getInstrInfo();
5324 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5325 MachineFunction *MF = MBB.getParent();
5327
5328 Register Dst = MI.getOperand(0).getReg();
5329 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5330 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5331 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5332
5333 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5334 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5335
// Fold the constant offset into a subregister index where possible.
5336 unsigned SubReg;
5337 std::tie(SubReg, Offset) =
5338 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5339
5340 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5341
5342 // Check for a SGPR index.
5343 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5345 const DebugLoc &DL = MI.getDebugLoc();
5346
5347 if (UseGPRIdxMode) {
5348 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5349 // to avoid interfering with other uses, so probably requires a new
5350 // optimization pass.
5352
5353 const MCInstrDesc &GPRIDXDesc =
5354 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5355 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5356 .addReg(SrcReg)
5357 .addReg(Idx)
5358 .addImm(SubReg);
5359 } else {
5361
// Implicit use of the full vector keeps it live across the movrel.
5362 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5363 .addReg(SrcReg, {}, SubReg)
5364 .addReg(SrcReg, RegState::Implicit);
5365 }
5366
5367 MI.eraseFromParent();
5368
5369 return &MBB;
5370 }
5371
5372 // Control flow needs to be inserted if indexing with a VGPR.
5373 const DebugLoc &DL = MI.getDebugLoc();
5375
5376 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5377 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5378
5379 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5380
5381 Register SGPRIdxReg;
5382 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5383 UseGPRIdxMode, SGPRIdxReg);
5384
5385 MachineBasicBlock *LoopBB = InsPt->getParent();
5386
5387 if (UseGPRIdxMode) {
5388 const MCInstrDesc &GPRIDXDesc =
5389 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5390
5391 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5392 .addReg(SrcReg)
5393 .addReg(SGPRIdxReg)
5394 .addImm(SubReg);
5395 } else {
5396 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5397 .addReg(SrcReg, {}, SubReg)
5398 .addReg(SrcReg, RegState::Implicit);
5399 }
5400
5401 MI.eraseFromParent();
5402
5403 return LoopBB;
5404}
5405
// NOTE(review): signature lines (orig. 5406-5407) and several short setup
// lines (orig. 5412, 5431, 5447, 5451, 5461) were lost in extraction; this is
// emitIndirectDst — expansion of the indirect vector-element write pseudo.
// Three paths: no index register (plain INSERT_SUBREG), SGPR index (single
// GPR-idx write or movreld), VGPR index (waterfall loop via loadM0FromVGPR).
5408 const GCNSubtarget &ST) {
5409 const SIInstrInfo *TII = ST.getInstrInfo();
5410 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5411 MachineFunction *MF = MBB.getParent();
5413
5414 Register Dst = MI.getOperand(0).getReg();
5415 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5416 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5417 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5418 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5419 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5420 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5421
5422 // This can be an immediate, but will be folded later.
5423 assert(Val->getReg());
5424
5425 unsigned SubReg;
5426 std::tie(SubReg, Offset) =
5427 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5428 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5429
// Constant index already folded into SubReg: plain subregister insert.
5430 if (Idx->getReg() == AMDGPU::NoRegister) {
5432 const DebugLoc &DL = MI.getDebugLoc();
5433
5434 assert(Offset == 0);
5435
5436 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5437 .add(*SrcVec)
5438 .add(*Val)
5439 .addImm(SubReg);
5440
5441 MI.eraseFromParent();
5442 return &MBB;
5443 }
5444
5445 // Check for a SGPR index.
5446 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5448 const DebugLoc &DL = MI.getDebugLoc();
5449
5450 if (UseGPRIdxMode) {
5452
5453 const MCInstrDesc &GPRIDXDesc =
5454 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5455 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5456 .addReg(SrcVec->getReg())
5457 .add(*Val)
5458 .addReg(Idx)
5459 .addImm(SubReg);
5460 } else {
5462
5463 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5464 TRI.getRegSizeInBits(*VecRC), 32, false);
5465 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5466 .addReg(SrcVec->getReg())
5467 .add(*Val)
5468 .addImm(SubReg);
5469 }
5470 MI.eraseFromParent();
5471 return &MBB;
5472 }
5473
5474 // Control flow needs to be inserted if indexing with a VGPR.
// Kill flags on Val would be wrong once it is used on every loop iteration.
5475 if (Val->isReg())
5476 MRI.clearKillFlags(Val->getReg());
5477
5478 const DebugLoc &DL = MI.getDebugLoc();
5479
5480 Register PhiReg = MRI.createVirtualRegister(VecRC);
5481
5482 Register SGPRIdxReg;
5483 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5484 UseGPRIdxMode, SGPRIdxReg);
5485 MachineBasicBlock *LoopBB = InsPt->getParent();
5486
5487 if (UseGPRIdxMode) {
5488 const MCInstrDesc &GPRIDXDesc =
5489 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5490
5491 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5492 .addReg(PhiReg)
5493 .add(*Val)
5494 .addReg(SGPRIdxReg)
5495 .addImm(SubReg);
5496 } else {
5497 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5498 TRI.getRegSizeInBits(*VecRC), 32, false);
5499 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5500 .addReg(PhiReg)
5501 .add(*Val)
5502 .addImm(SubReg);
5503 }
5504
5505 MI.eraseFromParent();
5506 return LoopBB;
5507}
5508
// NOTE(review): the signature line (orig. 5509) and the MRI initialization
// (orig. 5516) were lost in extraction; this expands S_ADD_U64_PSEUDO /
// S_SUB_U64_PSEUDO — confirm the function name against upstream.
5510 MachineBasicBlock *BB) {
5511 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5512 // For GFX12, we emit s_add_u64 and s_sub_u64.
5513 MachineFunction *MF = BB->getParent();
5514 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5515 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5517 const DebugLoc &DL = MI.getDebugLoc();
5518 MachineOperand &Dest = MI.getOperand(0);
5519 MachineOperand &Src0 = MI.getOperand(1);
5520 MachineOperand &Src1 = MI.getOperand(2);
5521 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5522 if (ST.hasScalarAddSub64()) {
// Native 64-bit scalar add/sub: single instruction.
5523 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5524 // clang-format off
5525 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5526 .add(Src0)
5527 .add(Src1);
5528 // clang-format on
5529 } else {
// Split into lo add/sub + hi add/sub-with-carry, then recombine.
5530 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5531 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5532
5533 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5534 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5535
5536 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5537 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5538 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5539 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5540
5541 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5542 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5543 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5544 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5545
5546 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5547 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5548 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5549 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5550 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5551 .addReg(DestSub0)
5552 .addImm(AMDGPU::sub0)
5553 .addReg(DestSub1)
5554 .addImm(AMDGPU::sub1);
5555 }
5556 MI.eraseFromParent();
5557 return BB;
5558}
5559
// NOTE(review): the signature line (orig. 5560) was lost in extraction; per
// the unreachable message below this is
// getIdentityValueFor32BitWaveReduction: identity (neutral) element for each
// supported 32-bit wave-reduce opcode.
5561 switch (Opc) {
5562 case AMDGPU::S_MIN_U32:
5563 return std::numeric_limits<uint32_t>::max();
5564 case AMDGPU::S_MIN_I32:
5565 return std::numeric_limits<int32_t>::max();
5566 case AMDGPU::S_MAX_U32:
5567 return std::numeric_limits<uint32_t>::min();
5568 case AMDGPU::S_MAX_I32:
5569 return std::numeric_limits<int32_t>::min();
5570 case AMDGPU::V_ADD_F32_e64: // -0.0
5571 return 0x80000000;
5572 case AMDGPU::V_SUB_F32_e64: // +0.0
5573 return 0x0;
5574 case AMDGPU::S_ADD_I32:
5575 case AMDGPU::S_SUB_I32:
5576 case AMDGPU::S_OR_B32:
5577 case AMDGPU::S_XOR_B32:
5578 return std::numeric_limits<uint32_t>::min();
5579 case AMDGPU::S_AND_B32:
5580 return std::numeric_limits<uint32_t>::max();
// qNaN is the neutral element for IEEE min/max (any-op-with-NaN = other).
5581 case AMDGPU::V_MIN_F32_e64:
5582 case AMDGPU::V_MAX_F32_e64:
5583 return 0x7fc00000; // qNAN
5584 default:
5586 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5587 }
5588}
5589
// NOTE(review): the signature line (orig. 5590) was lost in extraction; per
// the unreachable message below this is
// getIdentityValueFor64BitWaveReduction: identity (neutral) element for each
// supported 64-bit wave-reduce opcode. min/max are expressed as V_CMP
// compare opcodes here.
5591 switch (Opc) {
5592 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5593 return std::numeric_limits<uint64_t>::max();
5594 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5595 return std::numeric_limits<int64_t>::max();
5596 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5597 return std::numeric_limits<uint64_t>::min();
5598 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5599 return std::numeric_limits<int64_t>::min();
5600 case AMDGPU::V_MIN_F64_e64:
5601 case AMDGPU::V_MAX_F64_e64:
5602 case AMDGPU::V_MIN_NUM_F64_e64:
5603 case AMDGPU::V_MAX_NUM_F64_e64:
5604 return 0x7FF8000000000000; // qNAN
5605 case AMDGPU::S_ADD_U64_PSEUDO:
5606 case AMDGPU::S_SUB_U64_PSEUDO:
5607 case AMDGPU::S_OR_B64:
5608 case AMDGPU::S_XOR_B64:
5609 return std::numeric_limits<uint64_t>::min();
5610 case AMDGPU::S_AND_B64:
5611 return std::numeric_limits<uint64_t>::max();
5612 case AMDGPU::V_ADD_F64_e64:
5613 case AMDGPU::V_ADD_F64_pseudo_e64:
5614 return 0x8000000000000000; // -0.0
5615 default:
5617 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5618 }
5619}
5620
5621static bool is32bitWaveReduceOperation(unsigned Opc) {
5622 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5623 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5624 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5625 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5626 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5627 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5628 Opc == AMDGPU::V_SUB_F32_e64;
5629}
5630
// NOTE(review): the signature line (orig. 5631) was lost in extraction;
// presumably isFloatingPointWaveReduceOperation — returns true for the
// floating-point (f32/f64) wave-reduce opcodes.
5632 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5633 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5634 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5635 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5636 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5637}
5638
5641 const GCNSubtarget &ST,
5642 unsigned Opc) {
5644 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5645 const DebugLoc &DL = MI.getDebugLoc();
5646 const SIInstrInfo *TII = ST.getInstrInfo();
5647
5648 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5649 Register SrcReg = MI.getOperand(1).getReg();
5650 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5651 Register DstReg = MI.getOperand(0).getReg();
5652 MachineBasicBlock *RetBB = nullptr;
5653 if (isSGPR) {
5654 switch (Opc) {
5655 case AMDGPU::S_MIN_U32:
5656 case AMDGPU::S_MIN_I32:
5657 case AMDGPU::V_MIN_F32_e64:
5658 case AMDGPU::S_MAX_U32:
5659 case AMDGPU::S_MAX_I32:
5660 case AMDGPU::V_MAX_F32_e64:
5661 case AMDGPU::S_AND_B32:
5662 case AMDGPU::S_OR_B32: {
5663 // Idempotent operations.
5664 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5665 RetBB = &BB;
5666 break;
5667 }
5668 case AMDGPU::V_CMP_LT_U64_e64: // umin
5669 case AMDGPU::V_CMP_LT_I64_e64: // min
5670 case AMDGPU::V_CMP_GT_U64_e64: // umax
5671 case AMDGPU::V_CMP_GT_I64_e64: // max
5672 case AMDGPU::V_MIN_F64_e64:
5673 case AMDGPU::V_MIN_NUM_F64_e64:
5674 case AMDGPU::V_MAX_F64_e64:
5675 case AMDGPU::V_MAX_NUM_F64_e64:
5676 case AMDGPU::S_AND_B64:
5677 case AMDGPU::S_OR_B64: {
5678 // Idempotent operations.
5679 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5680 RetBB = &BB;
5681 break;
5682 }
5683 case AMDGPU::S_XOR_B32:
5684 case AMDGPU::S_XOR_B64:
5685 case AMDGPU::S_ADD_I32:
5686 case AMDGPU::S_ADD_U64_PSEUDO:
5687 case AMDGPU::V_ADD_F32_e64:
5688 case AMDGPU::V_ADD_F64_e64:
5689 case AMDGPU::V_ADD_F64_pseudo_e64:
5690 case AMDGPU::S_SUB_I32:
5691 case AMDGPU::S_SUB_U64_PSEUDO:
5692 case AMDGPU::V_SUB_F32_e64: {
5693 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5694 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5695 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5696 Register NumActiveLanes =
5697 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5698
5699 bool IsWave32 = ST.isWave32();
5700 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5701 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5702 unsigned BitCountOpc =
5703 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5704
5705 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5706
5707 auto NewAccumulator =
5708 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5709 .addReg(ExecMask);
5710
5711 switch (Opc) {
5712 case AMDGPU::S_XOR_B32:
5713 case AMDGPU::S_XOR_B64: {
5714 // Performing an XOR operation on a uniform value
5715 // depends on the parity of the number of active lanes.
5716 // For even parity, the result will be 0, for odd
5717 // parity the result will be the same as the input value.
5718 Register ParityRegister =
5719 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5720
5721 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5722 .addReg(NewAccumulator->getOperand(0).getReg())
5723 .addImm(1)
5724 .setOperandDead(3); // Dead scc
5725 if (Opc == AMDGPU::S_XOR_B32) {
5726 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5727 .addReg(SrcReg)
5728 .addReg(ParityRegister);
5729 } else {
5730 Register DestSub0 =
5731 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5732 Register DestSub1 =
5733 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5734
5735 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5736 const TargetRegisterClass *SrcSubRC =
5737 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5738
5739 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5740 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5741 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5742 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5743
5744 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5745 .add(Op1L)
5746 .addReg(ParityRegister);
5747
5748 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5749 .add(Op1H)
5750 .addReg(ParityRegister);
5751
5752 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5753 .addReg(DestSub0)
5754 .addImm(AMDGPU::sub0)
5755 .addReg(DestSub1)
5756 .addImm(AMDGPU::sub1);
5757 }
5758 break;
5759 }
5760 case AMDGPU::S_SUB_I32: {
5761 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5762
5763 // Take the negation of the source operand.
5764 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5765 .addImm(0)
5766 .addReg(SrcReg);
5767 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5768 .addReg(NegatedVal)
5769 .addReg(NewAccumulator->getOperand(0).getReg());
5770 break;
5771 }
5772 case AMDGPU::S_ADD_I32: {
5773 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5774 .addReg(SrcReg)
5775 .addReg(NewAccumulator->getOperand(0).getReg());
5776 break;
5777 }
5778 case AMDGPU::S_ADD_U64_PSEUDO:
5779 case AMDGPU::S_SUB_U64_PSEUDO: {
5780 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5781 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5782 Register Op1H_Op0L_Reg =
5783 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5784 Register Op1L_Op0H_Reg =
5785 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5786 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5787 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5788 Register NegatedValLo =
5789 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5790 Register NegatedValHi =
5791 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5792
5793 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5794 const TargetRegisterClass *Src1SubRC =
5795 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5796
5797 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5798 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5799 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5800 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5801
5802 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5803 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5804 .addImm(0)
5805 .addReg(NewAccumulator->getOperand(0).getReg())
5806 .setOperandDead(3); // Dead scc
5807 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5808 .addReg(NegatedValLo)
5809 .addImm(31)
5810 .setOperandDead(3); // Dead scc
5811 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5812 .add(Op1L)
5813 .addReg(NegatedValHi);
5814 }
5815 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5816 ? NegatedValLo
5817 : NewAccumulator->getOperand(0).getReg();
5818 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5819 .add(Op1L)
5820 .addReg(LowOpcode);
5821 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5822 .add(Op1L)
5823 .addReg(LowOpcode);
5824 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5825 .add(Op1H)
5826 .addReg(LowOpcode);
5827
5828 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5829 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5830 .addReg(CarryReg)
5831 .addReg(Op1H_Op0L_Reg)
5832 .setOperandDead(3); // Dead scc
5833
5834 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5835 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5836 .addReg(HiVal)
5837 .addReg(Op1L_Op0H_Reg)
5838 .setOperandDead(3); // Dead scc
5839 }
5840 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5841 .addReg(DestSub0)
5842 .addImm(AMDGPU::sub0)
5843 .addReg(DestSub1)
5844 .addImm(AMDGPU::sub1);
5845 break;
5846 }
5847 case AMDGPU::V_ADD_F32_e64:
5848 case AMDGPU::V_ADD_F64_e64:
5849 case AMDGPU::V_ADD_F64_pseudo_e64:
5850 case AMDGPU::V_SUB_F32_e64: {
5851 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5852 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
5853 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
5854 Register DstVreg = MRI.createVirtualRegister(VregRC);
5855 // Get number of active lanes as a float val.
5856 BuildMI(BB, MI, DL,
5857 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5858 : AMDGPU::V_CVT_F64_I32_e64),
5859 ActiveLanesVreg)
5860 .addReg(NewAccumulator->getOperand(0).getReg())
5861 .addImm(0) // clamp
5862 .addImm(0); // output-modifier
5863
5864 // Take negation of input for SUB reduction
5865 unsigned srcMod =
5866 (Opc == AMDGPU::V_SUB_F32_e64 ||
5867 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5870 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5871 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5872 ? AMDGPU::V_MUL_F64_pseudo_e64
5873 : AMDGPU::V_MUL_F64_e64;
5874 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
5875 DstVreg)
5876 .addImm(srcMod) // src0 modifier
5877 .addReg(SrcReg)
5878 .addImm(SISrcMods::NONE) // src1 modifier
5879 .addReg(ActiveLanesVreg)
5880 .addImm(SISrcMods::NONE) // clamp
5881 .addImm(SISrcMods::NONE); // output-mod
5882 if (is32BitOpc) {
5883 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5884 .addReg(DstVreg);
5885 } else {
5886 Register LaneValueLoReg =
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5888 Register LaneValueHiReg =
5889 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5890 const TargetRegisterClass *VregSubRC =
5891 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5892 MachineOperand Op1L =
5893 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5894 VregRC, AMDGPU::sub0, VregSubRC);
5895 MachineOperand Op1H =
5896 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5897 VregRC, AMDGPU::sub1, VregSubRC);
5898 // lane value input should be in an sgpr
5899 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5900 LaneValueLoReg)
5901 .add(Op1L);
5902 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5903 LaneValueHiReg)
5904 .add(Op1H);
5905 NewAccumulator =
5906 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5907 .addReg(LaneValueLoReg)
5908 .addImm(AMDGPU::sub0)
5909 .addReg(LaneValueHiReg)
5910 .addImm(AMDGPU::sub1);
5911 }
5912 }
5913 }
5914 RetBB = &BB;
5915 }
5916 }
5917 } else {
5918 // TODO: Implement DPP Strategy and switch based on immediate strategy
5919 // operand. For now, for all the cases (default, Iterative and DPP we use
5920 // iterative approach by default.)
5921
5922 // To reduce the VGPR using iterative approach, we need to iterate
5923 // over all the active lanes. Lowering consists of ComputeLoop,
5924 // which iterate over only active lanes. We use copy of EXEC register
5925 // as induction variable and every active lane modifies it using bitset0
5926 // so that we will get the next active lane for next iteration.
5928 Register SrcReg = MI.getOperand(1).getReg();
5929 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5931
5932 // Create Control flow for loop
5933 // Split MI's Machine Basic block into For loop
5934 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5935
5936 // Create virtual registers required for lowering.
5937 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5938 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5939 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5940 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5941 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5942 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5943 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5944 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5945 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5946
5947 bool IsWave32 = ST.isWave32();
5948 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5949 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5950
5951 // Create initial values of induction variable from Exec, Accumulator and
5952 // insert branch instr to newly created ComputeBlock
5953 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5954 if (is32BitOpc) {
5956 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5957 .addImm(IdentityValue);
5958 } else {
5959 uint64_t IdentityValue =
5960 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5961 ? 0x0 // +0.0 for double sub reduction
5963 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5964 .addImm(IdentityValue);
5965 }
5966 // clang-format off
5967 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5968 .addMBB(ComputeLoop);
5969 // clang-format on
5970
5971 // Start constructing ComputeLoop
5972 I = ComputeLoop->begin();
5973 auto Accumulator =
5974 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5975 .addReg(IdentityValReg)
5976 .addMBB(&BB);
5977 auto ActiveBits =
5978 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5979 .addReg(LoopIterator)
5980 .addMBB(&BB);
5981
5982 I = ComputeLoop->end();
5983 MachineInstr *NewAccumulator;
5984 // Perform the computations
5985 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5986 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5987 .addReg(ActiveBitsReg);
5988 if (is32BitOpc) {
5989 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5990 LaneValueReg)
5991 .addReg(SrcReg)
5992 .addReg(FF1Reg);
5993 if (isFPOp) {
5994 Register LaneValVreg =
5995 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5996 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5997 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5998 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5999 LaneValVreg)
6000 .addReg(LaneValueReg);
6001 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6002 .addImm(0) // src0 modifier
6003 .addReg(Accumulator->getOperand(0).getReg())
6004 .addImm(0) // src1 modifier
6005 .addReg(LaneValVreg)
6006 .addImm(0) // clamp
6007 .addImm(0); // omod
6008 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6009 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6010 .addReg(DstVreg);
6011 } else {
6012 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6013 .addReg(Accumulator->getOperand(0).getReg())
6014 .addReg(LaneValueReg);
6015 }
6016 } else {
6017 Register LaneValueLoReg =
6018 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6019 Register LaneValueHiReg =
6020 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6021 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6022 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6023 const TargetRegisterClass *SrcSubRC =
6024 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6025 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6026 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
6027 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6028 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
6029 // lane value input should be in an sgpr
6030 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6031 LaneValueLoReg)
6032 .add(Op1L)
6033 .addReg(FF1Reg);
6034 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6035 LaneValueHiReg)
6036 .add(Op1H)
6037 .addReg(FF1Reg);
6038 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
6039 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
6040 .addReg(LaneValueLoReg)
6041 .addImm(AMDGPU::sub0)
6042 .addReg(LaneValueHiReg)
6043 .addImm(AMDGPU::sub1);
6044 switch (Opc) {
6045 case AMDGPU::S_OR_B64:
6046 case AMDGPU::S_AND_B64:
6047 case AMDGPU::S_XOR_B64: {
6048 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6049 .addReg(Accumulator->getOperand(0).getReg())
6050 .addReg(LaneValue->getOperand(0).getReg())
6051 .setOperandDead(3); // Dead scc
6052 break;
6053 }
6054 case AMDGPU::V_CMP_GT_I64_e64:
6055 case AMDGPU::V_CMP_GT_U64_e64:
6056 case AMDGPU::V_CMP_LT_I64_e64:
6057 case AMDGPU::V_CMP_LT_U64_e64: {
6058 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6059 Register ComparisonResultReg =
6060 MRI.createVirtualRegister(WaveMaskRegClass);
6061 int SrcIdx =
6062 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6063 const TargetRegisterClass *VregClass =
6064 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6065 const TargetRegisterClass *VSubRegClass =
6066 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6067 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6068 MachineOperand SrcReg0Sub0 =
6069 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
6070 VregClass, AMDGPU::sub0, VSubRegClass);
6071 MachineOperand SrcReg0Sub1 =
6072 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
6073 VregClass, AMDGPU::sub1, VSubRegClass);
6074 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
6075 AccumulatorVReg)
6076 .add(SrcReg0Sub0)
6077 .addImm(AMDGPU::sub0)
6078 .add(SrcReg0Sub1)
6079 .addImm(AMDGPU::sub1);
6080 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6081 .addReg(LaneValue->getOperand(0).getReg())
6082 .addReg(AccumulatorVReg);
6083
6084 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6085 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6086 .addReg(LaneMaskReg)
6087 .addReg(ActiveBitsReg);
6088
6089 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6090 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6091 .addReg(LaneValue->getOperand(0).getReg())
6092 .addReg(Accumulator->getOperand(0).getReg());
6093 break;
6094 }
6095 case AMDGPU::V_MIN_F64_e64:
6096 case AMDGPU::V_MIN_NUM_F64_e64:
6097 case AMDGPU::V_MAX_F64_e64:
6098 case AMDGPU::V_MAX_NUM_F64_e64:
6099 case AMDGPU::V_ADD_F64_e64:
6100 case AMDGPU::V_ADD_F64_pseudo_e64: {
6101 int SrcIdx =
6102 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6103 const TargetRegisterClass *VregRC =
6104 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6105 const TargetRegisterClass *VregSubRC =
6106 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6107 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6108 Register DstVreg = MRI.createVirtualRegister(VregRC);
6109 Register LaneValLo =
6110 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6111 Register LaneValHi =
6112 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6113 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6114 .addReg(Accumulator->getOperand(0).getReg());
6115 unsigned Modifier =
6116 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6119 auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6120 .addImm(Modifier) // src0 modifiers
6121 .addReg(LaneValue->getOperand(0).getReg())
6122 .addImm(SISrcMods::NONE) // src1 modifiers
6123 .addReg(AccumulatorVReg)
6124 .addImm(SISrcMods::NONE) // clamp
6125 .addImm(SISrcMods::NONE); // omod
6126 auto ReadLaneLo =
6127 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6128 LaneValLo);
6129 auto ReadLaneHi =
6130 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6131 LaneValHi);
6132 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6133 MachineOperand Op1L =
6134 TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
6135 VregRC, AMDGPU::sub0, VregSubRC);
6136 MachineOperand Op1H =
6137 TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
6138 VregRC, AMDGPU::sub1, VregSubRC);
6139 ReadLaneLo.add(Op1L);
6140 ReadLaneHi.add(Op1H);
6141 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6142 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6143 .addReg(LaneValLo)
6144 .addImm(AMDGPU::sub0)
6145 .addReg(LaneValHi)
6146 .addImm(AMDGPU::sub1);
6147 break;
6148 }
6149 case AMDGPU::S_ADD_U64_PSEUDO:
6150 case AMDGPU::S_SUB_U64_PSEUDO: {
6151 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6152 .addReg(Accumulator->getOperand(0).getReg())
6153 .addReg(LaneValue->getOperand(0).getReg());
6154 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6155 break;
6156 }
6157 }
6158 }
6159 // Manipulate the iterator to get the next active lane
6160 unsigned BITSETOpc =
6161 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6162 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6163 .addReg(FF1Reg)
6164 .addReg(ActiveBitsReg);
6165
6166 // Add phi nodes
6167 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6168 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6169
6170 // Creating branching
6171 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6172 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6173 .addReg(NewActiveBitsReg)
6174 .addImm(0);
6175 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6176 .addMBB(ComputeLoop);
6177
6178 RetBB = ComputeEnd;
6179 }
6180 MI.eraseFromParent();
6181 return RetBB;
6182}
6183
6186 MachineBasicBlock *BB) const {
6187 MachineFunction *MF = BB->getParent();
6189 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6191 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6193 const DebugLoc &DL = MI.getDebugLoc();
6194
6195 switch (MI.getOpcode()) {
6196 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6197 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6198 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6199 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6200 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6201 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6202 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6203 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6204 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6205 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6206 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6207 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6208 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6209 ? AMDGPU::V_MIN_NUM_F64_e64
6210 : AMDGPU::V_MIN_F64_e64);
6211 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6212 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6213 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6214 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6215 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6216 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6217 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6218 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6219 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6220 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6221 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6222 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6223 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6224 ? AMDGPU::V_MAX_NUM_F64_e64
6225 : AMDGPU::V_MAX_F64_e64);
6226 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6227 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6228 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6229 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6230 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6231 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6232 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6233 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6234 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6235 ? AMDGPU::V_ADD_F64_pseudo_e64
6236 : AMDGPU::V_ADD_F64_e64);
6237 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6238 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6239 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6240 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6241 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6242 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6243 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6244 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6245 // fadd + neg, by setting the NEG bit in the instruction.
6246 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6247 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6248 ? AMDGPU::V_ADD_F64_pseudo_e64
6249 : AMDGPU::V_ADD_F64_e64);
6250 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6251 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6252 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6253 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6254 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6255 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6256 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6257 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6258 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6259 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6260 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6261 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6262 case AMDGPU::S_UADDO_PSEUDO:
6263 case AMDGPU::S_USUBO_PSEUDO: {
6264 MachineOperand &Dest0 = MI.getOperand(0);
6265 MachineOperand &Dest1 = MI.getOperand(1);
6266 MachineOperand &Src0 = MI.getOperand(2);
6267 MachineOperand &Src1 = MI.getOperand(3);
6268
6269 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6270 ? AMDGPU::S_ADD_U32
6271 : AMDGPU::S_SUB_U32;
6272 // clang-format off
6273 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6274 .add(Src0)
6275 .add(Src1);
6276 // clang-format on
6277
6278 unsigned SelOpc =
6279 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6280 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6281
6282 MI.eraseFromParent();
6283 return BB;
6284 }
6285 case AMDGPU::S_ADD_U64_PSEUDO:
6286 case AMDGPU::S_SUB_U64_PSEUDO: {
6287 return Expand64BitScalarArithmetic(MI, BB);
6288 }
6289 case AMDGPU::V_ADD_U64_PSEUDO:
6290 case AMDGPU::V_SUB_U64_PSEUDO: {
6291 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6292
6293 MachineOperand &Dest = MI.getOperand(0);
6294 MachineOperand &Src0 = MI.getOperand(1);
6295 MachineOperand &Src1 = MI.getOperand(2);
6296
6297 if (ST.hasAddSubU64Insts()) {
6298 auto I = BuildMI(*BB, MI, DL,
6299 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6300 : AMDGPU::V_SUB_U64_e64),
6301 Dest.getReg())
6302 .add(Src0)
6303 .add(Src1)
6304 .addImm(0); // clamp
6305 TII->legalizeOperands(*I);
6306 MI.eraseFromParent();
6307 return BB;
6308 }
6309
6310 if (IsAdd && ST.hasLshlAddU64Inst()) {
6311 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6312 Dest.getReg())
6313 .add(Src0)
6314 .addImm(0)
6315 .add(Src1);
6316 TII->legalizeOperands(*Add);
6317 MI.eraseFromParent();
6318 return BB;
6319 }
6320
6321 const auto *CarryRC = TRI->getWaveMaskRegClass();
6322
6323 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6324 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6325
6326 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6327 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6328
6329 const TargetRegisterClass *Src0RC = Src0.isReg()
6330 ? MRI.getRegClass(Src0.getReg())
6331 : &AMDGPU::VReg_64RegClass;
6332 const TargetRegisterClass *Src1RC = Src1.isReg()
6333 ? MRI.getRegClass(Src1.getReg())
6334 : &AMDGPU::VReg_64RegClass;
6335
6336 const TargetRegisterClass *Src0SubRC =
6337 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6338 const TargetRegisterClass *Src1SubRC =
6339 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6340
6341 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6342 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6343 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6344 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6345
6346 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6347 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6348 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6349 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6350
6351 unsigned LoOpc =
6352 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6353 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6354 .addReg(CarryReg, RegState::Define)
6355 .add(SrcReg0Sub0)
6356 .add(SrcReg1Sub0)
6357 .addImm(0); // clamp bit
6358
6359 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6360 MachineInstr *HiHalf =
6361 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6362 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6363 .add(SrcReg0Sub1)
6364 .add(SrcReg1Sub1)
6365 .addReg(CarryReg, RegState::Kill)
6366 .addImm(0); // clamp bit
6367
6368 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6369 .addReg(DestSub0)
6370 .addImm(AMDGPU::sub0)
6371 .addReg(DestSub1)
6372 .addImm(AMDGPU::sub1);
6373 TII->legalizeOperands(*LoHalf);
6374 TII->legalizeOperands(*HiHalf);
6375 MI.eraseFromParent();
6376 return BB;
6377 }
6378 case AMDGPU::S_ADD_CO_PSEUDO:
6379 case AMDGPU::S_SUB_CO_PSEUDO: {
6380 // This pseudo has a chance to be selected
6381 // only from uniform add/subcarry node. All the VGPR operands
6382 // therefore assumed to be splat vectors.
6384 MachineOperand &Dest = MI.getOperand(0);
6385 MachineOperand &CarryDest = MI.getOperand(1);
6386 MachineOperand &Src0 = MI.getOperand(2);
6387 MachineOperand &Src1 = MI.getOperand(3);
6388 MachineOperand &Src2 = MI.getOperand(4);
6389 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6390 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6391 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6392 .addReg(Src0.getReg());
6393 Src0.setReg(RegOp0);
6394 }
6395 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6396 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6397 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6398 .addReg(Src1.getReg());
6399 Src1.setReg(RegOp1);
6400 }
6401 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6402 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6403 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6404 .addReg(Src2.getReg());
6405 Src2.setReg(RegOp2);
6406 }
6407
6408 if (ST.isWave64()) {
6409 if (ST.hasScalarCompareEq64()) {
6410 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6411 .addReg(Src2.getReg())
6412 .addImm(0);
6413 } else {
6414 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6415 const TargetRegisterClass *SubRC =
6416 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6417 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6418 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6419 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6420 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6421 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6422
6423 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6424 .add(Src2Sub0)
6425 .add(Src2Sub1);
6426
6427 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6428 .addReg(Src2_32, RegState::Kill)
6429 .addImm(0);
6430 }
6431 } else {
6432 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6433 .addReg(Src2.getReg())
6434 .addImm(0);
6435 }
6436
6437 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6438 ? AMDGPU::S_ADDC_U32
6439 : AMDGPU::S_SUBB_U32;
6440
6441 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6442
6443 unsigned SelOpc =
6444 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6445
6446 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6447 .addImm(-1)
6448 .addImm(0);
6449
6450 MI.eraseFromParent();
6451 return BB;
6452 }
6453 case AMDGPU::SI_INIT_M0: {
6454 MachineOperand &M0Init = MI.getOperand(0);
6455 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6456 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6457 AMDGPU::M0)
6458 .add(M0Init);
6459 MI.eraseFromParent();
6460 return BB;
6461 }
6462 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6463 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6464 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6465 TII->get(AMDGPU::S_CMP_EQ_U32))
6466 .addImm(0)
6467 .addImm(0);
6468 return BB;
6469 }
6470 case AMDGPU::GET_GROUPSTATICSIZE: {
6471 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6472 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6473 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6474 .add(MI.getOperand(0))
6475 .addImm(MFI->getLDSSize());
6476 MI.eraseFromParent();
6477 return BB;
6478 }
6479 case AMDGPU::GET_SHADERCYCLESHILO: {
6480 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6481 // The algorithm is:
6482 //
6483 // hi1 = getreg(SHADER_CYCLES_HI)
6484 // lo1 = getreg(SHADER_CYCLES_LO)
6485 // hi2 = getreg(SHADER_CYCLES_HI)
6486 //
6487 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6488 // Otherwise there was overflow and the result is hi2:0. In both cases the
6489 // result should represent the actual time at some point during the sequence
6490 // of three getregs.
6491 using namespace AMDGPU::Hwreg;
6492 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6493 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6494 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6495 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6496 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6497 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6498 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6499 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6500 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6501 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6502 .addReg(RegHi1)
6503 .addReg(RegHi2);
6504 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6505 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6506 .addReg(RegLo1)
6507 .addImm(0);
6508 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6509 .add(MI.getOperand(0))
6510 .addReg(RegLo)
6511 .addImm(AMDGPU::sub0)
6512 .addReg(RegHi2)
6513 .addImm(AMDGPU::sub1);
6514 MI.eraseFromParent();
6515 return BB;
6516 }
6517 case AMDGPU::SI_INDIRECT_SRC_V1:
6518 case AMDGPU::SI_INDIRECT_SRC_V2:
6519 case AMDGPU::SI_INDIRECT_SRC_V3:
6520 case AMDGPU::SI_INDIRECT_SRC_V4:
6521 case AMDGPU::SI_INDIRECT_SRC_V5:
6522 case AMDGPU::SI_INDIRECT_SRC_V6:
6523 case AMDGPU::SI_INDIRECT_SRC_V7:
6524 case AMDGPU::SI_INDIRECT_SRC_V8:
6525 case AMDGPU::SI_INDIRECT_SRC_V9:
6526 case AMDGPU::SI_INDIRECT_SRC_V10:
6527 case AMDGPU::SI_INDIRECT_SRC_V11:
6528 case AMDGPU::SI_INDIRECT_SRC_V12:
6529 case AMDGPU::SI_INDIRECT_SRC_V16:
6530 case AMDGPU::SI_INDIRECT_SRC_V32:
6531 return emitIndirectSrc(MI, *BB, *getSubtarget());
6532 case AMDGPU::SI_INDIRECT_DST_V1:
6533 case AMDGPU::SI_INDIRECT_DST_V2:
6534 case AMDGPU::SI_INDIRECT_DST_V3:
6535 case AMDGPU::SI_INDIRECT_DST_V4:
6536 case AMDGPU::SI_INDIRECT_DST_V5:
6537 case AMDGPU::SI_INDIRECT_DST_V6:
6538 case AMDGPU::SI_INDIRECT_DST_V7:
6539 case AMDGPU::SI_INDIRECT_DST_V8:
6540 case AMDGPU::SI_INDIRECT_DST_V9:
6541 case AMDGPU::SI_INDIRECT_DST_V10:
6542 case AMDGPU::SI_INDIRECT_DST_V11:
6543 case AMDGPU::SI_INDIRECT_DST_V12:
6544 case AMDGPU::SI_INDIRECT_DST_V16:
6545 case AMDGPU::SI_INDIRECT_DST_V32:
6546 return emitIndirectDst(MI, *BB, *getSubtarget());
6547 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6548 case AMDGPU::SI_KILL_I1_PSEUDO:
6549 return splitKillBlock(MI, BB);
6550 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6551 Register Dst = MI.getOperand(0).getReg();
6552 const MachineOperand &Src0 = MI.getOperand(1);
6553 const MachineOperand &Src1 = MI.getOperand(2);
6554 Register SrcCond = MI.getOperand(3).getReg();
6555
6556 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6557 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6558 const auto *CondRC = TRI->getWaveMaskRegClass();
6559 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6560
6561 const TargetRegisterClass *Src0RC = Src0.isReg()
6562 ? MRI.getRegClass(Src0.getReg())
6563 : &AMDGPU::VReg_64RegClass;
6564 const TargetRegisterClass *Src1RC = Src1.isReg()
6565 ? MRI.getRegClass(Src1.getReg())
6566 : &AMDGPU::VReg_64RegClass;
6567
6568 const TargetRegisterClass *Src0SubRC =
6569 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6570 const TargetRegisterClass *Src1SubRC =
6571 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6572
6573 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6574 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6575 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6576 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6577
6578 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6579 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6580 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6581 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6582
6583 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6584 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6585 .addImm(0)
6586 .add(Src0Sub0)
6587 .addImm(0)
6588 .add(Src1Sub0)
6589 .addReg(SrcCondCopy);
6590 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6591 .addImm(0)
6592 .add(Src0Sub1)
6593 .addImm(0)
6594 .add(Src1Sub1)
6595 .addReg(SrcCondCopy);
6596
6597 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6598 .addReg(DstLo)
6599 .addImm(AMDGPU::sub0)
6600 .addReg(DstHi)
6601 .addImm(AMDGPU::sub1);
6602 MI.eraseFromParent();
6603 return BB;
6604 }
6605 case AMDGPU::SI_BR_UNDEF: {
6606 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6607 .add(MI.getOperand(0));
6608 Br->getOperand(1).setIsUndef(); // read undef SCC
6609 MI.eraseFromParent();
6610 return BB;
6611 }
6612 case AMDGPU::ADJCALLSTACKUP:
6613 case AMDGPU::ADJCALLSTACKDOWN: {
6615 MachineInstrBuilder MIB(*MF, &MI);
6616 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6617 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6618 return BB;
6619 }
6620 case AMDGPU::SI_CALL_ISEL: {
6621 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6622
6624 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6625
6626 for (const MachineOperand &MO : MI.operands())
6627 MIB.add(MO);
6628
6629 MIB.cloneMemRefs(MI);
6630 MI.eraseFromParent();
6631 return BB;
6632 }
6633 case AMDGPU::V_ADD_CO_U32_e32:
6634 case AMDGPU::V_SUB_CO_U32_e32:
6635 case AMDGPU::V_SUBREV_CO_U32_e32: {
6636 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6637 unsigned Opc = MI.getOpcode();
6638
6639 bool NeedClampOperand = false;
6640 if (TII->pseudoToMCOpcode(Opc) == -1) {
6642 NeedClampOperand = true;
6643 }
6644
6645 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6646 if (TII->isVOP3(*I)) {
6647 I.addReg(TRI->getVCC(), RegState::Define);
6648 }
6649 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6650 if (NeedClampOperand)
6651 I.addImm(0); // clamp bit for e64 encoding
6652
6653 TII->legalizeOperands(*I);
6654
6655 MI.eraseFromParent();
6656 return BB;
6657 }
6658 case AMDGPU::V_ADDC_U32_e32:
6659 case AMDGPU::V_SUBB_U32_e32:
6660 case AMDGPU::V_SUBBREV_U32_e32:
6661 // These instructions have an implicit use of vcc which counts towards the
6662 // constant bus limit.
6663 TII->legalizeOperands(MI);
6664 return BB;
6665 case AMDGPU::DS_GWS_INIT:
6666 case AMDGPU::DS_GWS_SEMA_BR:
6667 case AMDGPU::DS_GWS_BARRIER:
6668 case AMDGPU::DS_GWS_SEMA_V:
6669 case AMDGPU::DS_GWS_SEMA_P:
6670 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6671 // A s_waitcnt 0 is required to be the instruction immediately following.
6672 if (getSubtarget()->hasGWSAutoReplay()) {
6674 return BB;
6675 }
6676
6677 return emitGWSMemViolTestLoop(MI, BB);
6678 case AMDGPU::S_SETREG_B32: {
6679 // Try to optimize cases that only set the denormal mode or rounding mode.
6680 //
6681 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6682 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6683 // instead.
6684 //
6685 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6686 // allow you to have a no side effect instruction in the output of a
6687 // sideeffecting pattern.
6688 auto [ID, Offset, Width] =
6689 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6691 return BB;
6692
6693 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6694 const unsigned SetMask = WidthMask << Offset;
6695
6696 if (getSubtarget()->hasDenormModeInst()) {
6697 unsigned SetDenormOp = 0;
6698 unsigned SetRoundOp = 0;
6699
6700 // The dedicated instructions can only set the whole denorm or round mode
6701 // at once, not a subset of bits in either.
6702 if (SetMask ==
6704 // If this fully sets both the round and denorm mode, emit the two
6705 // dedicated instructions for these.
6706 SetRoundOp = AMDGPU::S_ROUND_MODE;
6707 SetDenormOp = AMDGPU::S_DENORM_MODE;
6708 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6709 SetRoundOp = AMDGPU::S_ROUND_MODE;
6710 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6711 SetDenormOp = AMDGPU::S_DENORM_MODE;
6712 }
6713
6714 if (SetRoundOp || SetDenormOp) {
6715 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6716 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6717 unsigned ImmVal = Def->getOperand(1).getImm();
6718 if (SetRoundOp) {
6719 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6720 .addImm(ImmVal & 0xf);
6721
6722 // If we also have the denorm mode, get just the denorm mode bits.
6723 ImmVal >>= 4;
6724 }
6725
6726 if (SetDenormOp) {
6727 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6728 .addImm(ImmVal & 0xf);
6729 }
6730
6731 MI.eraseFromParent();
6732 return BB;
6733 }
6734 }
6735 }
6736
6737 // If only FP bits are touched, used the no side effects pseudo.
6738 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6739 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6740 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6741
6742 return BB;
6743 }
6744 case AMDGPU::S_INVERSE_BALLOT_U32:
6745 case AMDGPU::S_INVERSE_BALLOT_U64:
6746 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6747 // necessary. After that they are equivalent to a COPY.
6748 MI.setDesc(TII->get(AMDGPU::COPY));
6749 return BB;
6750 case AMDGPU::ENDPGM_TRAP: {
6751 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6752 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6753 MI.addOperand(MachineOperand::CreateImm(0));
6754 return BB;
6755 }
6756
6757 // We need a block split to make the real endpgm a terminator. We also don't
6758 // want to break phis in successor blocks, so we can't just delete to the
6759 // end of the block.
6760
6761 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6763 MF->push_back(TrapBB);
6764 // clang-format off
6765 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6766 .addImm(0);
6767 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6768 .addMBB(TrapBB);
6769 // clang-format on
6770
6771 BB->addSuccessor(TrapBB);
6772 MI.eraseFromParent();
6773 return SplitBB;
6774 }
6775 case AMDGPU::SIMULATED_TRAP: {
6776 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6777 MachineBasicBlock *SplitBB =
6778 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6779 MI.eraseFromParent();
6780 return SplitBB;
6781 }
6782 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6783 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6785
6786 // During ISel, it's difficult to propagate the original EXEC mask to use as
6787 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6788 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6789 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6790 Register OriginalExec = Setup->getOperand(0).getReg();
6791 MF->getRegInfo().clearKillFlags(OriginalExec);
6792 MI.getOperand(0).setReg(OriginalExec);
6793 return BB;
6794 }
6795 default:
6796 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6797 if (!MI.mayStore())
6799 return BB;
6800 }
6802 }
6803}
6804
6806 // This currently forces unfolding various combinations of fsub into fma with
6807 // free fneg'd operands. As long as we have fast FMA (controlled by
6808 // isFMAFasterThanFMulAndFAdd), we should perform these.
6809
6810 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6811 // most of these combines appear to be cycle neutral but save on instruction
6812 // count / code size.
6813 return true;
6814}
6815
6817
6819 EVT VT) const {
6820 if (!VT.isVector()) {
6821 return MVT::i1;
6822 }
6823 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6824}
6825
6827 // TODO: Should i16 be used always if legal? For now it would force VALU
6828 // shifts.
6829 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6830}
6831
6833 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6834 ? Ty.changeElementSize(16)
6835 : Ty.changeElementSize(32);
6836}
6837
6838// Answering this is somewhat tricky and depends on the specific device which
6839// have different rates for fma or all f64 operations.
6840//
6841// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6842// regardless of which device (although the number of cycles differs between
6843// devices), so it is always profitable for f64.
6844//
6845// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6846// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6847// which we can always do even without fused FP ops since it returns the same
6848// result as the separate operations and since it is always full
6849// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6850// however does not support denormals, so we do report fma as faster if we have
6851// a fast fma device and require denormals.
6852//
6854 EVT VT) const {
6855 VT = VT.getScalarType();
6856
6857 switch (VT.getSimpleVT().SimpleTy) {
6858 case MVT::f32: {
6859 // If mad is not available this depends only on if f32 fma is full rate.
6860 if (!Subtarget->hasMadMacF32Insts())
6861 return Subtarget->hasFastFMAF32();
6862
6863 // Otherwise f32 mad is always full rate and returns the same result as
6864 // the separate operations so should be preferred over fma.
6865 // However does not support denormals.
6867 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6868
6869 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6870 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6871 }
6872 case MVT::f64:
6873 return true;
6874 case MVT::f16:
6875 case MVT::bf16:
6876 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6877 default:
6878 break;
6879 }
6880
6881 return false;
6882}
6883
6885 LLT Ty) const {
6886 switch (Ty.getScalarSizeInBits()) {
6887 case 16:
6888 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6889 case 32:
6890 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6891 case 64:
6892 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6893 default:
6894 break;
6895 }
6896
6897 return false;
6898}
6899
6901 if (!Ty.isScalar())
6902 return false;
6903
6904 if (Ty.getScalarSizeInBits() == 16)
6905 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6906 if (Ty.getScalarSizeInBits() == 32)
6907 return Subtarget->hasMadMacF32Insts() &&
6908 denormalModeIsFlushAllF32(*MI.getMF());
6909
6910 return false;
6911}
6912
6914 const SDNode *N) const {
6915 // TODO: Check future ftz flag
6916 // v_mad_f32/v_mac_f32 do not support denormals.
6917 EVT VT = N->getValueType(0);
6918 if (VT == MVT::f32)
6919 return Subtarget->hasMadMacF32Insts() &&
6921 if (VT == MVT::f16) {
6922 return Subtarget->hasMadF16() &&
6924 }
6925
6926 return false;
6927}
6928
6929//===----------------------------------------------------------------------===//
6930// Custom DAG Lowering Operations
6931//===----------------------------------------------------------------------===//
6932
6933// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6934// wider vector type is legal.
6936 SelectionDAG &DAG) const {
6937 unsigned Opc = Op.getOpcode();
6938 EVT VT = Op.getValueType();
6939 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6940 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6941 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6942 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6943 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6944 VT == MVT::v32bf16);
6945
6946 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6947
6948 SDLoc SL(Op);
6949 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6950 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6951
6952 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6953}
6954
6955// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6956// regression whereby extra unnecessary instructions were added to codegen
6957// for rotr operations, casued by legalising v2i32 or. This resulted in extra
6958// instructions to extract the result from the vector.
6960 [[maybe_unused]] EVT VT = Op.getValueType();
6961
6962 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6963 VT == MVT::v16i32) &&
6964 "Unexpected ValueType.");
6965
6966 return DAG.UnrollVectorOp(Op.getNode());
6967}
6968
6969// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6970// wider vector type is legal.
6972 SelectionDAG &DAG) const {
6973 unsigned Opc = Op.getOpcode();
6974 EVT VT = Op.getValueType();
6975 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6976 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6977 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6978 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6979 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6980 VT == MVT::v32bf16);
6981
6982 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6983 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6984
6985 SDLoc SL(Op);
6986
6987 SDValue OpLo =
6988 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6989 SDValue OpHi =
6990 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6991
6992 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6993}
6994
6996 SelectionDAG &DAG) const {
6997 unsigned Opc = Op.getOpcode();
6998 EVT VT = Op.getValueType();
6999 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7000 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7001 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7002 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7003 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7004 VT == MVT::v32bf16);
7005
7006 SDValue Op0 = Op.getOperand(0);
7007 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7008 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7009 : std::pair(Op0, Op0);
7010
7011 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7012 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7013
7014 SDLoc SL(Op);
7015 auto ResVT = DAG.GetSplitDestVTs(VT);
7016
7017 SDValue OpLo =
7018 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7019 SDValue OpHi =
7020 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7021
7022 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7023}
7024
7026 switch (Op.getOpcode()) {
7027 default:
7029 case ISD::BRCOND:
7030 return LowerBRCOND(Op, DAG);
7031 case ISD::RETURNADDR:
7032 return LowerRETURNADDR(Op, DAG);
7033 case ISD::SPONENTRY:
7034 return LowerSPONENTRY(Op, DAG);
7035 case ISD::LOAD: {
7036 SDValue Result = LowerLOAD(Op, DAG);
7037 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7038 "Load should return a value and a chain");
7039 return Result;
7040 }
7041 case ISD::FSQRT: {
7042 EVT VT = Op.getValueType();
7043 if (VT == MVT::f32)
7044 return lowerFSQRTF32(Op, DAG);
7045 if (VT == MVT::f64)
7046 return lowerFSQRTF64(Op, DAG);
7047 return SDValue();
7048 }
7049 case ISD::FSIN:
7050 case ISD::FCOS:
7051 return LowerTrig(Op, DAG);
7052 case ISD::SELECT:
7053 return LowerSELECT(Op, DAG);
7054 case ISD::FDIV:
7055 return LowerFDIV(Op, DAG);
7056 case ISD::FFREXP:
7057 return LowerFFREXP(Op, DAG);
7059 return LowerATOMIC_CMP_SWAP(Op, DAG);
7060 case ISD::STORE:
7061 return LowerSTORE(Op, DAG);
7062 case ISD::GlobalAddress: {
7065 return LowerGlobalAddress(MFI, Op, DAG);
7066 }
7068 return LowerExternalSymbol(Op, DAG);
7070 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7072 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7074 return LowerINTRINSIC_VOID(Op, DAG);
7075 case ISD::ADDRSPACECAST:
7076 return lowerADDRSPACECAST(Op, DAG);
7078 return lowerINSERT_SUBVECTOR(Op, DAG);
7080 return lowerINSERT_VECTOR_ELT(Op, DAG);
7082 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7084 return lowerVECTOR_SHUFFLE(Op, DAG);
7086 return lowerSCALAR_TO_VECTOR(Op, DAG);
7087 case ISD::BUILD_VECTOR:
7088 return lowerBUILD_VECTOR(Op, DAG);
7089 case ISD::FP_ROUND:
7091 return lowerFP_ROUND(Op, DAG);
7092 case ISD::TRAP:
7093 return lowerTRAP(Op, DAG);
7094 case ISD::DEBUGTRAP:
7095 return lowerDEBUGTRAP(Op, DAG);
7096 case ISD::ABS:
7097 case ISD::FABS:
7098 case ISD::FNEG:
7099 case ISD::FCANONICALIZE:
7100 case ISD::BSWAP:
7101 return splitUnaryVectorOp(Op, DAG);
7102 case ISD::FMINNUM:
7103 case ISD::FMAXNUM:
7104 return lowerFMINNUM_FMAXNUM(Op, DAG);
7105 case ISD::FMINIMUMNUM:
7106 case ISD::FMAXIMUMNUM:
7107 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7108 case ISD::FMINIMUM:
7109 case ISD::FMAXIMUM:
7110 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7111 case ISD::FLDEXP:
7112 case ISD::STRICT_FLDEXP:
7113 return lowerFLDEXP(Op, DAG);
7114 case ISD::FMA:
7115 return splitTernaryVectorOp(Op, DAG);
7116 case ISD::FP_TO_SINT:
7117 case ISD::FP_TO_UINT:
7118 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7119 Op.getValueType() == MVT::i16 &&
7120 Op.getOperand(0).getValueType() == MVT::f32) {
7121 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7122 return Op;
7123 }
7124 return LowerFP_TO_INT(Op, DAG);
7125 case ISD::SHL:
7126 case ISD::SRA:
7127 case ISD::SRL:
7128 case ISD::ADD:
7129 case ISD::SUB:
7130 case ISD::SMIN:
7131 case ISD::SMAX:
7132 case ISD::UMIN:
7133 case ISD::UMAX:
7134 case ISD::FADD:
7135 case ISD::FMUL:
7136 case ISD::FMINNUM_IEEE:
7137 case ISD::FMAXNUM_IEEE:
7138 case ISD::UADDSAT:
7139 case ISD::USUBSAT:
7140 case ISD::SADDSAT:
7141 case ISD::SSUBSAT:
7142 return splitBinaryVectorOp(Op, DAG);
7143 case ISD::FCOPYSIGN:
7144 return lowerFCOPYSIGN(Op, DAG);
7145 case ISD::MUL:
7146 return lowerMUL(Op, DAG);
7147 case ISD::SMULO:
7148 case ISD::UMULO:
7149 return lowerXMULO(Op, DAG);
7150 case ISD::SMUL_LOHI:
7151 case ISD::UMUL_LOHI:
7152 return lowerXMUL_LOHI(Op, DAG);
7154 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7155 case ISD::STACKSAVE:
7156 return LowerSTACKSAVE(Op, DAG);
7157 case ISD::GET_ROUNDING:
7158 return lowerGET_ROUNDING(Op, DAG);
7159 case ISD::SET_ROUNDING:
7160 return lowerSET_ROUNDING(Op, DAG);
7161 case ISD::PREFETCH:
7162 return lowerPREFETCH(Op, DAG);
7163 case ISD::FP_EXTEND:
7165 return lowerFP_EXTEND(Op, DAG);
7166 case ISD::GET_FPENV:
7167 return lowerGET_FPENV(Op, DAG);
7168 case ISD::SET_FPENV:
7169 return lowerSET_FPENV(Op, DAG);
7170 case ISD::ROTR:
7171 return lowerROTR(Op, DAG);
7172 }
7173 return SDValue();
7174}
7175
7176// Used for D16: Casts the result of an instruction into the right vector,
7177// packs values if loads return unpacked values.
7179 const SDLoc &DL, SelectionDAG &DAG,
7180 bool Unpacked) {
7181 if (!LoadVT.isVector())
7182 return Result;
7183
7184 // Cast back to the original packed type or to a larger type that is a
7185 // multiple of 32 bit for D16. Widening the return type is a required for
7186 // legalization.
7187 EVT FittingLoadVT = LoadVT;
7188 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7189 FittingLoadVT =
7191 LoadVT.getVectorNumElements() + 1);
7192 }
7193
7194 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7195 // Truncate to v2i16/v4i16.
7196 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7197
7198 // Workaround legalizer not scalarizing truncate after vector op
7199 // legalization but not creating intermediate vector trunc.
7201 DAG.ExtractVectorElements(Result, Elts);
7202 for (SDValue &Elt : Elts)
7203 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7204
7205 // Pad illegal v1i16/v3fi6 to v4i16
7206 if ((LoadVT.getVectorNumElements() % 2) == 1)
7207 Elts.push_back(DAG.getPOISON(MVT::i16));
7208
7209 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7210
7211 // Bitcast to original type (v2f16/v4f16).
7212 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7213 }
7214
7215 // Cast back to the original packed type.
7216 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7217}
7218
7219SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7220 SelectionDAG &DAG,
7222 bool IsIntrinsic) const {
7223 SDLoc DL(M);
7224
7225 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7226 EVT LoadVT = M->getValueType(0);
7227
7228 EVT EquivLoadVT = LoadVT;
7229 if (LoadVT.isVector()) {
7230 if (Unpacked) {
7231 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7232 LoadVT.getVectorNumElements());
7233 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7234 // Widen v3f16 to legal type
7235 EquivLoadVT =
7237 LoadVT.getVectorNumElements() + 1);
7238 }
7239 }
7240
7241 // Change from v4f16/v2f16 to EquivLoadVT.
7242 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7243
7245 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7246 M->getMemoryVT(), M->getMemOperand());
7247
7248 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7249
7250 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7251}
7252
7253SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7254 SelectionDAG &DAG,
7255 ArrayRef<SDValue> Ops) const {
7256 SDLoc DL(M);
7257 EVT LoadVT = M->getValueType(0);
7258 EVT EltType = LoadVT.getScalarType();
7259 EVT IntVT = LoadVT.changeTypeToInteger();
7260
7261 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7262
7263 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7264 bool IsTFE = M->getNumValues() == 3;
7265
7266 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7267 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7268 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7269 : AMDGPUISD::BUFFER_LOAD;
7270
7271 if (IsD16) {
7272 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7273 }
7274
7275 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7276 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7277 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7278 IsTFE);
7279
7280 if (isTypeLegal(LoadVT)) {
7281 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7282 M->getMemOperand(), DAG);
7283 }
7284
7285 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7286 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7287 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7288 M->getMemOperand(), DAG);
7289 return DAG.getMergeValues(
7290 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7291 DL);
7292}
7293
7295 SelectionDAG &DAG) {
7296 EVT VT = N->getValueType(0);
7297 unsigned CondCode = N->getConstantOperandVal(3);
7298 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7299 return DAG.getPOISON(VT);
7300
7301 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7302
7303 SDValue LHS = N->getOperand(1);
7304 SDValue RHS = N->getOperand(2);
7305
7306 SDLoc DL(N);
7307
7308 EVT CmpVT = LHS.getValueType();
7309 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7310 unsigned PromoteOp =
7312 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7313 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7314 }
7315
7316 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7317
7318 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7319 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7320
7321 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7322 DAG.getCondCode(CCOpcode));
7323 if (VT.bitsEq(CCVT))
7324 return SetCC;
7325 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7326}
7327
7329 SelectionDAG &DAG) {
7330 EVT VT = N->getValueType(0);
7331
7332 unsigned CondCode = N->getConstantOperandVal(3);
7333 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7334 return DAG.getPOISON(VT);
7335
7336 SDValue Src0 = N->getOperand(1);
7337 SDValue Src1 = N->getOperand(2);
7338 EVT CmpVT = Src0.getValueType();
7339 SDLoc SL(N);
7340
7341 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7342 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7343 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7344 }
7345
7346 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7347 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7348 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7349 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7350 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7351 DAG.getCondCode(CCOpcode));
7352 if (VT.bitsEq(CCVT))
7353 return SetCC;
7354 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7355}
7356
7358 SelectionDAG &DAG) {
7359 EVT VT = N->getValueType(0);
7360 SDValue Src = N->getOperand(1);
7361 SDLoc SL(N);
7362
7363 if (Src.getOpcode() == ISD::SETCC) {
7364 SDValue Op0 = Src.getOperand(0);
7365 SDValue Op1 = Src.getOperand(1);
7366 // Need to expand bfloat to float for comparison (setcc).
7367 if (Op0.getValueType() == MVT::bf16) {
7368 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7369 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7370 }
7371 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7372 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7373 }
7374 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7375 // (ballot 0) -> 0
7376 if (Arg->isZero())
7377 return DAG.getConstant(0, SL, VT);
7378
7379 // (ballot 1) -> EXEC/EXEC_LO
7380 if (Arg->isOne()) {
7381 Register Exec;
7382 if (VT.getScalarSizeInBits() == 32)
7383 Exec = AMDGPU::EXEC_LO;
7384 else if (VT.getScalarSizeInBits() == 64)
7385 Exec = AMDGPU::EXEC;
7386 else
7387 return SDValue();
7388
7389 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7390 }
7391 }
7392
7393 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7394 // ISD::SETNE)
7395 return DAG.getNode(
7396 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7397 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7398}
7399
7401 SelectionDAG &DAG) {
7402 EVT VT = N->getValueType(0);
7403 unsigned ValSize = VT.getSizeInBits();
7404 unsigned IID = N->getConstantOperandVal(0);
7405 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7406 IID == Intrinsic::amdgcn_permlanex16;
7407 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7408 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7409 SDLoc SL(N);
7410 MVT IntVT = MVT::getIntegerVT(ValSize);
7411 const GCNSubtarget *ST = TLI.getSubtarget();
7412 unsigned SplitSize = 32;
7413 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7414 ST->hasDPALU_DPP() &&
7415 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7416 SplitSize = 64;
7417
7418 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7419 SDValue Src2, MVT ValT) -> SDValue {
7420 SmallVector<SDValue, 8> Operands;
7421 switch (IID) {
7422 case Intrinsic::amdgcn_permlane16:
7423 case Intrinsic::amdgcn_permlanex16:
7424 case Intrinsic::amdgcn_update_dpp:
7425 Operands.push_back(N->getOperand(6));
7426 Operands.push_back(N->getOperand(5));
7427 Operands.push_back(N->getOperand(4));
7428 [[fallthrough]];
7429 case Intrinsic::amdgcn_writelane:
7430 Operands.push_back(Src2);
7431 [[fallthrough]];
7432 case Intrinsic::amdgcn_readlane:
7433 case Intrinsic::amdgcn_set_inactive:
7434 case Intrinsic::amdgcn_set_inactive_chain_arg:
7435 case Intrinsic::amdgcn_mov_dpp8:
7436 Operands.push_back(Src1);
7437 [[fallthrough]];
7438 case Intrinsic::amdgcn_readfirstlane:
7439 case Intrinsic::amdgcn_permlane64:
7440 Operands.push_back(Src0);
7441 break;
7442 default:
7443 llvm_unreachable("unhandled lane op");
7444 }
7445
7446 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7447 std::reverse(Operands.begin(), Operands.end());
7448
7449 if (SDNode *GL = N->getGluedNode()) {
7450 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7451 GL = GL->getOperand(0).getNode();
7452 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7453 SDValue(GL, 0)));
7454 }
7455
7456 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7457 };
7458
7459 SDValue Src0 = N->getOperand(1);
7460 SDValue Src1, Src2;
7461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7462 IID == Intrinsic::amdgcn_mov_dpp8 ||
7463 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7464 Src1 = N->getOperand(2);
7465 if (IID == Intrinsic::amdgcn_writelane ||
7466 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7467 Src2 = N->getOperand(3);
7468 }
7469
7470 if (ValSize == SplitSize) {
7471 // Already legal
7472 return SDValue();
7473 }
7474
7475 if (ValSize < 32) {
7476 bool IsFloat = VT.isFloatingPoint();
7477 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7478 SL, MVT::i32);
7479
7480 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7481 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7482 SL, MVT::i32);
7483 }
7484
7485 if (IID == Intrinsic::amdgcn_writelane) {
7486 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7487 SL, MVT::i32);
7488 }
7489
7490 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7491 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7492 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7493 }
7494
7495 if (ValSize % SplitSize != 0)
7496 return SDValue();
7497
7498 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7499 EVT VT = N->getValueType(0);
7500 unsigned NE = VT.getVectorNumElements();
7501 EVT EltVT = VT.getVectorElementType();
7503 unsigned NumOperands = N->getNumOperands();
7504 SmallVector<SDValue, 4> Operands(NumOperands);
7505 SDNode *GL = N->getGluedNode();
7506
7507 // only handle convergencectrl_glue
7509
7510 for (unsigned i = 0; i != NE; ++i) {
7511 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7512 ++j) {
7513 SDValue Operand = N->getOperand(j);
7514 EVT OperandVT = Operand.getValueType();
7515 if (OperandVT.isVector()) {
7516 // A vector operand; extract a single element.
7517 EVT OperandEltVT = OperandVT.getVectorElementType();
7518 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7519 Operand, DAG.getVectorIdxConstant(i, SL));
7520 } else {
7521 // A scalar operand; just use it as is.
7522 Operands[j] = Operand;
7523 }
7524 }
7525
7526 if (GL)
7527 Operands[NumOperands - 1] =
7528 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7529 SDValue(GL->getOperand(0).getNode(), 0));
7530
7531 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7532 }
7533
7534 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7535 return DAG.getBuildVector(VecVT, SL, Scalars);
7536 };
7537
7538 if (VT.isVector()) {
7539 switch (MVT::SimpleValueType EltTy =
7541 case MVT::i32:
7542 case MVT::f32:
7543 if (SplitSize == 32) {
7544 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7545 return unrollLaneOp(LaneOp.getNode());
7546 }
7547 [[fallthrough]];
7548 case MVT::i16:
7549 case MVT::f16:
7550 case MVT::bf16: {
7551 unsigned SubVecNumElt =
7552 SplitSize / VT.getVectorElementType().getSizeInBits();
7553 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7555 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7556 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7557 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7558 DAG.getConstant(EltIdx, SL, MVT::i32));
7559
7560 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7561 IsPermLane16)
7562 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7563 DAG.getConstant(EltIdx, SL, MVT::i32));
7564
7565 if (IID == Intrinsic::amdgcn_writelane)
7566 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7567 DAG.getConstant(EltIdx, SL, MVT::i32));
7568
7569 Pieces.push_back(
7570 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7571 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7572 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7573 EltIdx += SubVecNumElt;
7574 }
7575 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7576 }
7577 default:
7578 // Handle all other cases by bitcasting to i32 vectors
7579 break;
7580 }
7581 }
7582
7583 MVT VecVT =
7584 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7585 Src0 = DAG.getBitcast(VecVT, Src0);
7586
7587 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7588 Src1 = DAG.getBitcast(VecVT, Src1);
7589
7590 if (IID == Intrinsic::amdgcn_writelane)
7591 Src2 = DAG.getBitcast(VecVT, Src2);
7592
7593 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7594 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7595 return DAG.getBitcast(VT, UnrolledLaneOp);
7596}
7597
                                   SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  // Only 32-bit wide payloads are handled by this lowering.
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  // Operand 0 is the intrinsic id; operand 1 is the value to shuffle and
  // operand 2 the source-lane index.
  SDValue Value = N->getOperand(1);
  SDValue Index = N->getOperand(2);

  // ds_bpermute requires index to be multiplied by 4
  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
  SDValue ShiftedIndex =
      DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);

  // Intrinsics will require i32 to operate on
  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);

  // Helper: build an INTRINSIC_WO_CHAIN node with the id as the leading
  // target constant followed by the supplied arguments.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue> Operands(1);
    Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
    Operands.append(IntrinArgs);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
  };

  // If we can bpermute across the whole wave, then just do that
  // NOTE(review): the guarding condition line (presumably a wave32 check,
  // given the isWave64 assert below) was lost in this listing — confirm
  // against the original file.
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, BPermute);
  }

  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode
  SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));

  // Set inactive lanes to poison
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  // permlane64 swaps the two 32-lane halves of the wave64 value.
  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from
  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
                    {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});

  // Bit 5 of (ThreadID ^ Index) tells whether the requested source lane is
  // in the same 32-lane half as the current lane.
  SDValue SameOrOtherHalf =
      DAG.getNode(ISD::AND, SL, MVT::i32,
                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
                  DAG.getTargetConstant(32, SL, MVT::i32));
  SDValue UseSameHalf =
      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
                   DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
                                 BPermOtherHalfWWM);
  return DAG.getBitcast(VT, Result);
}
7675
7678 SelectionDAG &DAG) const {
7679 switch (N->getOpcode()) {
7681 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7682 Results.push_back(Res);
7683 return;
7684 }
7686 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7687 Results.push_back(Res);
7688 return;
7689 }
7691 unsigned IID = N->getConstantOperandVal(0);
7692 switch (IID) {
7693 case Intrinsic::amdgcn_make_buffer_rsrc:
7694 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7695 return;
7696 case Intrinsic::amdgcn_cvt_pkrtz: {
7697 SDValue Src0 = N->getOperand(1);
7698 SDValue Src1 = N->getOperand(2);
7699 SDLoc SL(N);
7700 SDValue Cvt =
7701 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7702 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7703 return;
7704 }
7705 case Intrinsic::amdgcn_cvt_pknorm_i16:
7706 case Intrinsic::amdgcn_cvt_pknorm_u16:
7707 case Intrinsic::amdgcn_cvt_pk_i16:
7708 case Intrinsic::amdgcn_cvt_pk_u16: {
7709 SDValue Src0 = N->getOperand(1);
7710 SDValue Src1 = N->getOperand(2);
7711 SDLoc SL(N);
7712 unsigned Opcode;
7713
7714 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7715 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7716 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7717 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7718 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7719 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7720 else
7721 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7722
7723 EVT VT = N->getValueType(0);
7724 if (isTypeLegal(VT))
7725 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7726 else {
7727 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7728 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7729 }
7730 return;
7731 }
7732 case Intrinsic::amdgcn_s_buffer_load: {
7733 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7734 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7735 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7736 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7737 // s_buffer_load_i8.
7738 if (!Subtarget->hasScalarSubwordLoads())
7739 return;
7740 SDValue Op = SDValue(N, 0);
7741 SDValue Rsrc = Op.getOperand(1);
7742 SDValue Offset = Op.getOperand(2);
7743 SDValue CachePolicy = Op.getOperand(3);
7744 EVT VT = Op.getValueType();
7745 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7746 SDLoc DL(Op);
7748 const DataLayout &DataLayout = DAG.getDataLayout();
7749 Align Alignment =
7755 VT.getStoreSize(), Alignment);
7756 SDValue LoadVal;
7757 if (!Offset->isDivergent()) {
7758 SDValue Ops[] = {Rsrc, // source register
7759 Offset, CachePolicy};
7760 SDValue BufferLoad =
7761 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7762 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7763 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7764 } else {
7765 SDValue Ops[] = {
7766 DAG.getEntryNode(), // Chain
7767 Rsrc, // rsrc
7768 DAG.getConstant(0, DL, MVT::i32), // vindex
7769 {}, // voffset
7770 {}, // soffset
7771 {}, // offset
7772 CachePolicy, // cachepolicy
7773 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7774 };
7775 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7776 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7777 }
7778 Results.push_back(LoadVal);
7779 return;
7780 }
7781 case Intrinsic::amdgcn_dead: {
7782 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7783 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7784 return;
7785 }
7786 }
7787 break;
7788 }
7790 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7791 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7792 // FIXME: Hacky
7793 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7794 Results.push_back(Res.getOperand(I));
7795 }
7796 } else {
7797 Results.push_back(Res);
7798 Results.push_back(Res.getValue(1));
7799 }
7800 return;
7801 }
7802
7803 break;
7804 }
7805 case ISD::SELECT: {
7806 SDLoc SL(N);
7807 EVT VT = N->getValueType(0);
7808 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7809 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7810 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7811
7812 EVT SelectVT = NewVT;
7813 if (NewVT.bitsLT(MVT::i32)) {
7814 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7815 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7816 SelectVT = MVT::i32;
7817 }
7818
7819 SDValue NewSelect =
7820 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7821
7822 if (NewVT != SelectVT)
7823 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7824 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7825 return;
7826 }
7827 case ISD::FNEG: {
7828 if (N->getValueType(0) != MVT::v2f16)
7829 break;
7830
7831 SDLoc SL(N);
7832 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7833
7834 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7835 DAG.getConstant(0x80008000, SL, MVT::i32));
7836 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7837 return;
7838 }
7839 case ISD::FABS: {
7840 if (N->getValueType(0) != MVT::v2f16)
7841 break;
7842
7843 SDLoc SL(N);
7844 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7845
7846 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7847 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7848 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7849 return;
7850 }
7851 case ISD::FSQRT: {
7852 if (N->getValueType(0) != MVT::f16)
7853 break;
7854 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7855 break;
7856 }
7857 default:
7859 break;
7860 }
7861}
7862
7863/// Helper function for LowerBRCOND
7864static SDNode *findUser(SDValue Value, unsigned Opcode) {
7865
7866 for (SDUse &U : Value->uses()) {
7867 if (U.get() != Value)
7868 continue;
7869
7870 if (U.getUser()->getOpcode() == Opcode)
7871 return U.getUser();
7872 }
7873 return nullptr;
7874}
7875
7876unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7877 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7878 switch (Intr->getConstantOperandVal(1)) {
7879 case Intrinsic::amdgcn_if:
7880 return AMDGPUISD::IF;
7881 case Intrinsic::amdgcn_else:
7882 return AMDGPUISD::ELSE;
7883 case Intrinsic::amdgcn_loop:
7884 return AMDGPUISD::LOOP;
7885 case Intrinsic::amdgcn_end_cf:
7886 llvm_unreachable("should not occur");
7887 default:
7888 return 0;
7889 }
7890 }
7891
7892 // break, if_break, else_break are all only used as inputs to loop, not
7893 // directly as branch conditions.
7894 return 0;
7895}
7896
7903
7905 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7906 return false;
7907
7908 // FIXME: Either avoid relying on address space here or change the default
7909 // address space for functions to avoid the explicit check.
7910 return (GV->getValueType()->isFunctionTy() ||
7913}
7914
7916 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7917}
7918
7920 if (!GV->hasExternalLinkage())
7921 return true;
7922
7923 const auto OS = getTargetMachine().getTargetTriple().getOS();
7924 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7925}
7926
7927/// This transforms the control flow intrinsics to get the branch destination as
7928/// last parameter, also switches branch target with BR if the need arise
7929SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7930 SDLoc DL(BRCOND);
7931
7932 SDNode *Intr = BRCOND.getOperand(1).getNode();
7933 SDValue Target = BRCOND.getOperand(2);
7934 SDNode *BR = nullptr;
7935 SDNode *SetCC = nullptr;
7936
7937 switch (Intr->getOpcode()) {
7938 case ISD::SETCC: {
7939 // As long as we negate the condition everything is fine
7940 SetCC = Intr;
7941 Intr = SetCC->getOperand(0).getNode();
7942 break;
7943 }
7944 case ISD::XOR: {
7945 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7946 SDValue LHS = Intr->getOperand(0);
7947 SDValue RHS = Intr->getOperand(1);
7948 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7949 Intr = LHS.getNode();
7950 break;
7951 }
7952 [[fallthrough]];
7953 }
7954 default: {
7955 // Get the target from BR if we don't negate the condition
7956 BR = findUser(BRCOND, ISD::BR);
7957 assert(BR && "brcond missing unconditional branch user");
7958 Target = BR->getOperand(1);
7959 }
7960 }
7961
7962 unsigned CFNode = isCFIntrinsic(Intr);
7963 if (CFNode == 0) {
7964 // This is a uniform branch so we don't need to legalize.
7965 return BRCOND;
7966 }
7967
7968 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7970
7971 assert(!SetCC ||
7972 (SetCC->getConstantOperandVal(1) == 1 &&
7973 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7974 ISD::SETNE));
7975
7976 // operands of the new intrinsic call
7978 if (HaveChain)
7979 Ops.push_back(BRCOND.getOperand(0));
7980
7981 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7982 Ops.push_back(Target);
7983
7984 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7985
7986 // build the new intrinsic call
7987 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7988
7989 if (!HaveChain) {
7990 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7991
7993 }
7994
7995 if (BR) {
7996 // Give the branch instruction our target
7997 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7998 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7999 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
8000 }
8001
8002 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
8003
8004 // Copy the intrinsic results to registers
8005 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
8006 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
8007 if (!CopyToReg)
8008 continue;
8009
8010 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
8011 SDValue(Result, i - 1), SDValue());
8012
8013 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
8014 }
8015
8016 // Remove the old intrinsic from the chain
8017 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
8018 Intr->getOperand(0));
8019
8020 return Chain;
8021}
8022
8023SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8024 MVT VT = Op.getSimpleValueType();
8025 SDLoc DL(Op);
8026 // Checking the depth
8027 if (Op.getConstantOperandVal(0) != 0)
8028 return DAG.getConstant(0, DL, VT);
8029
8030 MachineFunction &MF = DAG.getMachineFunction();
8031 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8032 // Check for kernel and shader functions
8033 if (Info->isEntryFunction())
8034 return DAG.getConstant(0, DL, VT);
8035
8036 MachineFrameInfo &MFI = MF.getFrameInfo();
8037 // There is a call to @llvm.returnaddress in this function
8038 MFI.setReturnAddressIsTaken(true);
8039
8040 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8041 // Get the return address reg and mark it as an implicit live-in
8042 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8043 getRegClassFor(VT, Op.getNode()->isDivergent()));
8044
8045 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8046}
8047
8048SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8049 MachineFunction &MF = DAG.getMachineFunction();
8050 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8051
8052 // For functions that set up their own stack, select the GET_STACK_BASE
8053 // pseudo.
8054 if (MFI->isBottomOfStack())
8055 return Op;
8056
8057 // For everything else, create a dummy stack object.
8058 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8059 return DAG.getFrameIndex(FI, Op.getValueType());
8060}
8061
8062SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8063 const SDLoc &DL, EVT VT) const {
8064 return Op.getValueType().bitsLE(VT)
8065 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8066 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8067 DAG.getTargetConstant(0, DL, MVT::i32));
8068}
8069
8070SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8071 SelectionDAG &DAG) const {
8072 EVT DstVT = Op.getValueType();
8073 unsigned NumElts = DstVT.getVectorNumElements();
8074 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8075
8076 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8077
8078 SDLoc DL(Op);
8079 unsigned Opc = Op.getOpcode();
8080 SDValue Flags = Op.getOperand(1);
8081 EVT HalfDstVT =
8082 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8083 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8084 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8085
8086 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8087}
8088
8089SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8090 SDValue Src = Op.getOperand(0);
8091 EVT SrcVT = Src.getValueType();
8092 EVT DstVT = Op.getValueType();
8093
8094 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8095 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8096 if (SrcVT.getScalarType() != MVT::f32)
8097 return SDValue();
8098 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8099 }
8100
8101 if (SrcVT.getScalarType() != MVT::f64)
8102 return Op;
8103
8104 SDLoc DL(Op);
8105 if (DstVT == MVT::f16) {
8106 // TODO: Handle strictfp
8107 if (Op.getOpcode() != ISD::FP_ROUND)
8108 return Op;
8109
8110 if (!Subtarget->has16BitInsts()) {
8111 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
8112 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8113 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8114 }
8115 if (Op->getFlags().hasApproximateFuncs()) {
8116 SDValue Flags = Op.getOperand(1);
8117 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
8118 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
8119 }
8120 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8121 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8122 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8123 }
8124
8125 assert(DstVT.getScalarType() == MVT::bf16 &&
8126 "custom lower FP_ROUND for f16 or bf16");
8127 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8128
8129 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8130 // hardware f32 -> bf16 instruction.
8131 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
8132 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
8133 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
8134 DAG.getTargetConstant(0, DL, MVT::i32));
8135}
8136
8137SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8138 SelectionDAG &DAG) const {
8139 EVT VT = Op.getValueType();
8140 const MachineFunction &MF = DAG.getMachineFunction();
8141 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8142 bool IsIEEEMode = Info->getMode().IEEE;
8143
8144 // FIXME: Assert during selection that this is only selected for
8145 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8146 // mode functions, but this happens to be OK since it's only done in cases
8147 // where there is known no sNaN.
8148 if (IsIEEEMode)
8149 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8150
8151 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8152 VT == MVT::v16bf16)
8153 return splitBinaryVectorOp(Op, DAG);
8154 return Op;
8155}
8156
8157SDValue
8158SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8159 SelectionDAG &DAG) const {
8160 EVT VT = Op.getValueType();
8161 const MachineFunction &MF = DAG.getMachineFunction();
8162 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8163 bool IsIEEEMode = Info->getMode().IEEE;
8164
8165 if (IsIEEEMode)
8166 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8167
8168 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8169 VT == MVT::v16bf16)
8170 return splitBinaryVectorOp(Op, DAG);
8171 return Op;
8172}
8173
8174SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8175 SelectionDAG &DAG) const {
8176 EVT VT = Op.getValueType();
8177 if (VT.isVector())
8178 return splitBinaryVectorOp(Op, DAG);
8179
8180 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8181 !Subtarget->hasMinimum3Maximum3F16() &&
8182 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8183 "should not need to widen f16 minimum/maximum to v2f16");
8184
8185 // Widen f16 operation to v2f16
8186
8187 // fminimum f16:x, f16:y ->
8188 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8189 // (v2f16 (scalar_to_vector y))), 0
8190 SDLoc SL(Op);
8191 SDValue WideSrc0 =
8192 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8193 SDValue WideSrc1 =
8194 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8195
8196 SDValue Widened =
8197 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8198
8199 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8200 DAG.getConstant(0, SL, MVT::i32));
8201}
8202
8203SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8204 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8205 EVT VT = Op.getValueType();
8206 assert(VT == MVT::f16);
8207
8208 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8209 EVT ExpVT = Exp.getValueType();
8210 if (ExpVT == MVT::i16)
8211 return Op;
8212
8213 SDLoc DL(Op);
8214
8215 // Correct the exponent type for f16 to i16.
8216 // Clamp the range of the exponent to the instruction's range.
8217
8218 // TODO: This should be a generic narrowing legalization, and can easily be
8219 // for GlobalISel.
8220
8221 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8222 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8223
8224 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8225 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8226
8227 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8228
8229 if (IsStrict) {
8230 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8231 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8232 }
8233
8234 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8235}
8236
8238 switch (Op->getOpcode()) {
8239 case ISD::SRA:
8240 case ISD::SMIN:
8241 case ISD::SMAX:
8242 return ISD::SIGN_EXTEND;
8243 case ISD::SRL:
8244 case ISD::UMIN:
8245 case ISD::UMAX:
8246 return ISD::ZERO_EXTEND;
8247 case ISD::ADD:
8248 case ISD::SUB:
8249 case ISD::AND:
8250 case ISD::OR:
8251 case ISD::XOR:
8252 case ISD::SHL:
8253 case ISD::SELECT:
8254 case ISD::MUL:
8255 // operation result won't be influenced by garbage high bits.
8256 // TODO: are all of those cases correct, and are there more?
8257 return ISD::ANY_EXTEND;
8258 case ISD::SETCC: {
8259 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8261 }
8262 default:
8263 llvm_unreachable("unexpected opcode!");
8264 }
8265}
8266
8267SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8268 DAGCombinerInfo &DCI) const {
8269 const unsigned Opc = Op.getOpcode();
8270 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8271 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8272 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8273 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8274 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8275
8276 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8277 : Op->getOperand(0).getValueType();
8278 auto &DAG = DCI.DAG;
8279 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8280
8281 if (DCI.isBeforeLegalizeOps() ||
8282 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8283 return SDValue();
8284
8285 SDLoc DL(Op);
8286 SDValue LHS;
8287 SDValue RHS;
8288 if (Opc == ISD::SELECT) {
8289 LHS = Op->getOperand(1);
8290 RHS = Op->getOperand(2);
8291 } else {
8292 LHS = Op->getOperand(0);
8293 RHS = Op->getOperand(1);
8294 }
8295
8296 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8297 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8298
8299 // Special case: for shifts, the RHS always needs a zext.
8300 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8301 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8302 else
8303 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8304
8305 // setcc always return i1/i1 vec so no need to truncate after.
8306 if (Opc == ISD::SETCC) {
8307 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8308 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8309 }
8310
8311 // For other ops, we extend the operation's return type as well so we need to
8312 // truncate back to the original type.
8313 SDValue NewVal;
8314 if (Opc == ISD::SELECT)
8315 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8316 else
8317 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8318
8319 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8320}
8321
8322SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8323 SDValue Mag = Op.getOperand(0);
8324 EVT MagVT = Mag.getValueType();
8325
8326 if (MagVT.getVectorNumElements() > 2)
8327 return splitBinaryVectorOp(Op, DAG);
8328
8329 SDValue Sign = Op.getOperand(1);
8330 EVT SignVT = Sign.getValueType();
8331
8332 if (MagVT == SignVT)
8333 return Op;
8334
8335 // fcopysign v2f16:mag, v2f32:sign ->
8336 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8337
8338 SDLoc SL(Op);
8339 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8340 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8341
8342 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8343
8344 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8345}
8346
8347// Custom lowering for vector multiplications and s_mul_u64.
8348SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8349 EVT VT = Op.getValueType();
8350
8351 // Split vector operands.
8352 if (VT.isVector())
8353 return splitBinaryVectorOp(Op, DAG);
8354
8355 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8356
8357 // There are four ways to lower s_mul_u64:
8358 //
8359 // 1. If all the operands are uniform, then we lower it as it is.
8360 //
8361 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8362 // multiplications because there is not a vector equivalent of s_mul_u64.
8363 //
8364 // 3. If the cost model decides that it is more efficient to use vector
8365 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8366 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8367 //
8368 // 4. If the cost model decides to use vector registers and both of the
8369 // operands are zero-extended/sign-extended from 32-bits, then we split the
8370 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8371 // possible to check if the operands are zero-extended or sign-extended in
8372 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8373 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8374 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8375 // If the cost model decides that we have to use vector registers, then
8376 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8377 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8378 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8379 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8380 // SIInstrInfo.cpp .
8381
8382 if (Op->isDivergent())
8383 return SDValue();
8384
8385 SDValue Op0 = Op.getOperand(0);
8386 SDValue Op1 = Op.getOperand(1);
8387 // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
8388 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8389 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8390 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8391 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8392 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8393 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8394 SDLoc SL(Op);
8395 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8396 return SDValue(
8397 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8398 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8399 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8400 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8401 return SDValue(
8402 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8403 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8404 return Op;
8405}
8406
8407SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8408 EVT VT = Op.getValueType();
8409 SDLoc SL(Op);
8410 SDValue LHS = Op.getOperand(0);
8411 SDValue RHS = Op.getOperand(1);
8412 bool isSigned = Op.getOpcode() == ISD::SMULO;
8413
8414 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8415 const APInt &C = RHSC->getAPIntValue();
8416 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8417 if (C.isPowerOf2()) {
8418 // smulo(x, signed_min) is same as umulo(x, signed_min).
8419 bool UseArithShift = isSigned && !C.isMinSignedValue();
8420 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8421 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8422 SDValue Overflow =
8423 DAG.getSetCC(SL, MVT::i1,
8424 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8425 Result, ShiftAmt),
8426 LHS, ISD::SETNE);
8427 return DAG.getMergeValues({Result, Overflow}, SL);
8428 }
8429 }
8430
8431 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8432 SDValue Top =
8433 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8434
8435 SDValue Sign = isSigned
8436 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8437 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8438 SL, MVT::i32))
8439 : DAG.getConstant(0, SL, VT);
8440 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8441
8442 return DAG.getMergeValues({Result, Overflow}, SL);
8443}
8444
8445SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8446 if (Op->isDivergent()) {
8447 // Select to V_MAD_[IU]64_[IU]32.
8448 return Op;
8449 }
8450 if (Subtarget->hasSMulHi()) {
8451 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8452 return SDValue();
8453 }
8454 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8455 // calculate the high part, so we might as well do the whole thing with
8456 // V_MAD_[IU]64_[IU]32.
8457 return Op;
8458}
8459
8460SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8461 if (!Subtarget->hasTrapHandler() ||
8462 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8463 return lowerTrapEndpgm(Op, DAG);
8464
8465 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8466 : lowerTrapHsaQueuePtr(Op, DAG);
8467}
8468
8469SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8470 SDLoc SL(Op);
8471 SDValue Chain = Op.getOperand(0);
8472 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8473}
8474
8475SDValue
8476SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8477 const SDLoc &DL, Align Alignment,
8478 ImplicitParameter Param) const {
8479 MachineFunction &MF = DAG.getMachineFunction();
8480 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8481 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8482 MachinePointerInfo PtrInfo =
8484 return DAG.getLoad(
8485 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8487}
8488
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  // The AMDHSA trap handler ABI expects the queue pointer in SGPR0/SGPR1.
  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  // NOTE(review): the 'if' line testing the code object version appears to be
  // elided from this listing; confirm against the upstream source.
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    // Pre-V5: the queue pointer arrives in a user SGPR pair.
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // Copy the queue pointer into SGPR0/1; the glue (ToReg.getValue(1)) keeps
  // the copy adjacent to the trap node.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
8524
8525SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8526 SDLoc SL(Op);
8527 SDValue Chain = Op.getOperand(0);
8528
8529 // We need to simulate the 's_trap 2' instruction on targets that run in
8530 // PRIV=1 (where it is treated as a nop).
8531 if (Subtarget->hasPrivEnabledTrap2NopBug())
8532 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8533
8534 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8535 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8536 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8537}
8538
8539SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8540 SDLoc SL(Op);
8541 SDValue Chain = Op.getOperand(0);
8542 MachineFunction &MF = DAG.getMachineFunction();
8543
8544 if (!Subtarget->hasTrapHandler() ||
8545 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8546 LLVMContext &Ctx = MF.getFunction().getContext();
8547 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8548 "debugtrap handler not supported",
8549 Op.getDebugLoc(), DS_Warning));
8550 return Chain;
8551 }
8552
8553 uint64_t TrapID =
8554 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8555 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8556 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8557}
8558
// Return the high 32 bits of the flat aperture base for the given segment
// address space (LOCAL or PRIVATE), as an i32 SDValue.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  // Fast path: read the aperture from the dedicated hardware registers.
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    //   s_mov_b64 s[6:7], src_shared_base
    //   v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  // NOTE(review): the code-object-version check and the selection of Param
  // (shared vs. private base) appear to be elided from this listing; confirm
  // against the upstream source.
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  // Pre-V5 fallback: load the aperture from the queue descriptor.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  // NOTE(review): the trailing MachineMemOperand flag arguments of this load
  // appear to be elided from this listing; confirm upstream.
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
}
8619
/// Return true if the value is a known valid address, such that a null check is
/// not necessary.
// NOTE(review): the opening of this definition ('static bool isKnownNonNull(
// SDValue Val, SelectionDAG &DAG,') and the condition of the first early
// return appear to be elided from this listing; confirm upstream.
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
    return true;

  // A constant pointer is known valid as long as it is not the address
  // space's designated null value.
  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  // TODO: Search through arithmetic, handle arguments and loads
  // marked nonnull.
  return false;
}
8634
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  // This handles both a plain ISD::ADDRSPACECAST node and the
  // llvm.amdgcn.addrspacecast.nonnull intrinsic, which carries the address
  // spaces as constant operands and guarantees the source is non-null.
  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // The segment pointer is the low 32 bits of the flat pointer.
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                AMDGPU::S_MOV_B32, SL, MVT::i32,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
            0);
        Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
      }

      // Skip the null check when the pointer is provably non-null.
      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      // Otherwise map the flat null pointer to the segment null value.
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        // Compute the lane id via mbcnt_lo (plus mbcnt_hi on wave64).
        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
        ThreadID = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
            AllOnes, ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
              AllOnes, ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                AMDGPU::S_MOV_B64, SL, MVT::i64,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
            0};
        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      } else {
        // Build the 64-bit flat pointer as {segment offset, aperture base}.
        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      // Map the segment null value to the flat null pointer.
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  // 32-bit constant address -> 64-bit: splice in the function's known high
  // bits (plain zext when they are zero).
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    if (Info->get32BitAddressHighBits() == 0)
      return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);

    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  // 64-bit -> 32-bit constant address: drop the high half.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(Op->getValueType(0));
}
8765
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue Ins = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // For 16-bit elements inserted at an even offset, operate on i32 pairs
  // instead of individual 16-bit elements.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
    // NOTE(review): the middle line of this conditional expression (the start
    // of the ':' arm, presumably 'EVT::getVectorVT(*DAG.getContext(),')
    // appears to be elided from this listing; confirm upstream.
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                     MVT::i32, InsNumElts / 2);

    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        // The whole (bitcast) subvector is a single i32.
        Elt = Ins;
      } else {
        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
                          DAG.getConstant(I, SL, MVT::i32));
      }
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
    }

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  }

  // Generic path: scalarize the subvector and insert element by element.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }
  return Vec;
}
8820
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // Split into two v2i16 halves, insert into the half that holds the
    // element, then reassemble the pair.
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat =
        InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
                 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
  // BFM: EltSize ones shifted to the selected element's bit position.
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
                               DAG.getSplatBuildVector(VecVT, SL, InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue RHS =
      DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI =
      DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);

  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
8900
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  // For wide vectors, split into two halves (via i64 pieces), select the half
  // containing the element, and recurse with a masked index.
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    // Select the half containing the element: Idx > NElem/2-1 means Hi.
    // NewIdx is the index within the selected half.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the selected element down to bit 0 of the integer-typed vector.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    // 16-bit FP results: truncate to i16 then bitcast back to the FP type.
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
9002
9003static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9004 assert(Elt % 2 == 0);
9005 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9006}
9007
9008static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9009 assert(Elt % 2 == 0);
9010 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9011 !(Mask[Elt + 1] & 1);
9012}
9013
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                   vector_shuffle <1, 0> (extract_subvector rhs, 2)

  // NOTE(review): the declaration of Pieces (a SmallVector of SDValue)
  // appears to be elided from this listing; confirm upstream.
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    // NOTE(review): the second operand of this '&&' (presumably an
    // elementPairIsContiguous check) appears to be elided here.
    if (ShouldUseConsecutiveExtract &&
      // Both output lanes read consecutive elements of one source: extract
      // an aligned 2-element subvector directly.
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
                                   SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
      // NOTE(review): the second operand of this '&&' (presumably a legality
      // check for 2-element shuffles of PackVT) appears to be elided here.
      int Idx0 = SVN->getMaskElt(I);
      int Idx1 = SVN->getMaskElt(I + 1);

      // Normalize the indices into (operand, offset) pairs.
      SDValue SrcOp0 = SVN->getOperand(0);
      SDValue SrcOp1 = SrcOp0;
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
                                    DAG.getConstant(AlignedIdx0, SL, MVT::i32));
      SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
                                    DAG.getConstant(AlignedIdx1, SL, MVT::i32));

      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        // Two distinct pieces: second mask index refers into Result1.
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        // Same piece on both sides: the second operand is unused.
        Result1 = DAG.getPOISON(PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
                                          {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Shuf);
    } else {
      // Fallback: scalarize this pair of output elements.
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
                                 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
                                 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
9132
SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue SVal = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();
  EVT SValVT = SVal.getValueType();
  SDValue UndefVal = DAG.getPOISON(SValVT);
  SDLoc SL(Op);

  // Build {SVal, poison, poison, ...}: only lane 0 is defined.
  // NOTE(review): the declaration of VElts (a SmallVector of SDValue) appears
  // to be elided from this listing; confirm upstream.
  VElts.push_back(SVal);
  for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
    VElts.push_back(UndefVal);

  return DAG.getBuildVector(ResultVT, SL, VElts);
}
9148
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  // 2 x 16-bit case: assemble the pair with integer zext/shift/or. Only
  // reached when the operation isn't legal (no VOP3P instructions).
  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

    SDValue Lo = Op.getOperand(0);
    SDValue Hi = Op.getOperand(1);

    // Avoid adding defined bits with the zero_extend.
    if (Hi.isUndef()) {
      Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
      SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
      return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
    }

    Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
    Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

    SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                                DAG.getConstant(16, SL, MVT::i32));
    if (Lo.isUndef())
      return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

    // The two halves occupy disjoint bit ranges, hence the Disjoint flag.
    SDValue Or =
        DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  }

  // Split into 2-element chunks.
  const unsigned NumParts = VT.getVectorNumElements() / 2;
  EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
  MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());

  // NOTE(review): the declaration of Casts (a SmallVector of SDValue) appears
  // to be elided from this listing; confirm upstream.
  for (unsigned P = 0; P < NumParts; ++P) {
    // Build each 2-element chunk and bitcast it to a scalar integer.
    SDValue Vec = DAG.getBuildVector(
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
    Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
  }

  // Recombine the integer chunks and bitcast back to the requested type.
  SDValue Blend =
      DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
9199
// NOTE(review): the first line of this definition ('bool
// SITargetLowering::isOffsetFoldingLegal(') appears to be elided from this
// listing; confirm against the upstream source.
                                    const GlobalAddressSDNode *GA) const {
  // OSes that use ELF REL relocations (instead of RELA) can only store a
  // 32-bit addend in the instruction, so it is not safe to allow offset folding
  // which can create arbitrary 64-bit addends. (This is only a problem for
  // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
  // the high 32 bits of the addend.)
  //
  // This should be kept in sync with how HasRelocationAddend is initialized in
  // the constructor of ELFAMDGPUAsmBackend.
  if (!Subtarget->isAmdHsaOS())
    return false;

  // We can fold offsets for anything that doesn't require a GOT relocation.
  // NOTE(review): the remaining disjuncts of this return expression (further
  // address-space checks) appear to be elided from this listing.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
}
9219
static SDValue
// NOTE(review): the first parameter line of this signature (presumably
// 'buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,')
// appears to be elided from this listing; confirm upstream.
                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
    assert(GAFlags != SIInstrInfo::MO_NONE);

    // With 64-bit literals a single 64-bit pc-relative node suffices; the
    // '+ 2' selects the 64-bit variant of the relocation flag.
    SDValue Ptr =
        DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
    return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
  }

  // Otherwise emit a lo/hi pair; with MO_NONE only the low half carries a
  // relocation and the high half is a plain zero.
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE)
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  else
    PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
9264
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  const GlobalValue *GV = GSD->getGlobal();
  // NOTE(review): the conditions guarding this block (address-space checks
  // preceding the linkage test) appear to be elided from this listing;
  // confirm against the upstream source.
      GV->hasExternalLinkage()) {
    const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
    // HIP uses an unsized array `extern __shared__ T s[]` or similar
    // zero-sized type in other languages to declare the dynamic shared
    // memory which size is not known at the compile time. They will be
    // allocated by the runtime and placed directly after the static
    // allocated ones. They all share the same offset.
    if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
      assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
      // Adjust alignment for that dynamic shared memory array.
      // NOTE(review): the definition of F appears to be elided here.
      MFI->setDynLDSAlign(F, GVar);
      MFI->setUsesDynamicLDS(true);
      return SDValue(
          DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
    }
  }
  }

  // NOTE(review): the condition guarding this LDS-address lowering and the
  // final flag operand of getTargetGlobalAddress appear to be elided here.
  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
  return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  // PAL/Mesa: materialize the absolute 64-bit address with s_mov moves.
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
      // NOTE(review): the opening of this call ('SDValue Addr =
      // DAG.getTargetGlobalAddress(') appears to be elided here.
          GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
      return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
                     0);
    }

    SDValue AddrLo = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};

    SDValue AddrHi = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};

    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
  }

  // Direct pc-relative fixup where permitted.
  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  // NOTE(review): trailing relocation-flag arguments of the following calls,
  // the PointerType construction, and the memory-operand flags of the final
  // load appear to be elided from this listing; confirm upstream.
  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,

  // Otherwise go through the GOT: build the GOT slot address pc-relatively
  // and load the actual pointer from it.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
  PointerType *PtrTy =
  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
  MachinePointerInfo PtrInfo =

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
}
9343
9344SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9345 SelectionDAG &DAG) const {
9346 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9347 const Function &Fn = DAG.getMachineFunction().getFunction();
9348 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9349 Fn, "unsupported external symbol", Op.getDebugLoc()));
9350 return DAG.getPOISON(Op.getValueType());
9351}
9352
                                   const SDLoc &DL, SDValue V) const {
  // Copy \p V into the M0 register, threading \p Chain through the node, and
  // return the resulting SI_INIT_M0 chain value.
  //
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
9368
9369SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9370 MVT VT,
9371 unsigned Offset) const {
9372 SDLoc SL(Op);
9373 SDValue Param = lowerKernargMemParameter(
9374 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9375 // The local size values will have the hi 16-bits as zero.
9376 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9377 DAG.getValueType(VT));
9378}
9379
                                        EVT VT) {
  // Report use of an HSA-only lowering path mismatch (non-HSA intrinsic while
  // targeting HSA), then fold the result to poison so lowering can proceed.
      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
  return DAG.getPOISON(VT);
}
9387
                                         EVT VT) {
  // Report use of an intrinsic that is not available on this subtarget, then
  // fold the result to poison so lowering can proceed.
      "intrinsic not supported on subtarget", DL.getDebugLoc()));
  return DAG.getPOISON(VT);
}
9395
                                    ArrayRef<SDValue> Elts) {
  // Build a single f32 (or f32 vector) value from \p Elts, bitcasting each
  // element to f32. Counts above 12 are padded with poison up to 16 lanes.
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {
    Type = MVT::getVectorVT(MVT::f32, NumElts);
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  // Normalize every element to f32 so a homogeneous vector can be built.
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  // Fill any remaining lanes with poison.
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getPOISON(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}
9424
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
                              SDValue Src, int ExtraElts) {
  // Widen \p Src (a scalar or a vector) to \p CastVT by appending
  // \p ExtraElts poison elements.
  EVT SrcVT = Src.getValueType();


  if (SrcVT.isVector())
    DAG.ExtractVectorElements(Src, Elts);
  else
    Elts.push_back(Src);

  SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
  while (ExtraElts--)
    Elts.push_back(Undef);

  return DAG.getBuildVector(CastVT, DL, Elts);
}
9442
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the
// required return type is an aggregate
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {
  // Determine the required return type. This is the same regardless of
  // IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  // Packed D16 data and packed 16-bit atomics carry two elements per dword.
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT =
      NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

  MVT MaskPopVT =
      MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

  SDValue Data(Result, 0);
  SDValue TexFail;

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    // Trim the raw instruction result down to the dwords selected by dmask.
    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
    if (MaskPopVT.isVector()) {
      Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    } else {
      Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    }
  }

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
                          NumDataDwords - MaskPopDwords);

  if (IsD16)
    Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);

  EVT LegalReqRetVT = ReqRetVT;
  if (!ReqRetVT.isVector()) {
    if (!Data.getValueType().isInteger())
      Data = DAG.getNode(ISD::BITCAST, DL,
                         Data.getValueType().changeTypeToInteger(), Data);
    Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
  } else {
    // We need to widen the return vector to a legal type
    if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
        ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
      LegalReqRetVT =
                        ReqRetVT.getVectorNumElements() + 1);
    }
  }
  Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);

  if (IsTexFail) {
    // The TFE/LWE status dword follows the data dwords in the result.
    TexFail =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
                    DAG.getConstant(MaskPopDwords, DL, MVT::i32));

    return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
  }

  if (Result->getNumValues() == 1)
    return Data;

  return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
9518
9519static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9520 SDValue *LWE, bool &IsTexFail) {
9521 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9522
9523 uint64_t Value = TexFailCtrlConst->getZExtValue();
9524 if (Value) {
9525 IsTexFail = true;
9526 }
9527
9528 SDLoc DL(TexFailCtrlConst);
9529 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9530 Value &= ~(uint64_t)0x1;
9531 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9532 Value &= ~(uint64_t)0x2;
9533
9534 return Value == 0;
9535}
9536
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  // Pack the 16-bit operands of \p Op in the index range [DimIdx, EndIdx)
  // pairwise into 32-bit dwords and append them to \p PackedAddrs.
  SDLoc DL(Op);
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    SDValue Addr = Op.getOperand(I);

    // Gradients are packed with undef for each coordinate.
    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
    // 1D: undef,dx/dh; undef,dx/dv
    // 2D: dy/dh,dx/dh; dy/dv,dx/dv
    // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      // An unpaired element occupies the low half of a dword on its own.
      if (Addr.getValueType() != MVT::i16)
        Addr = DAG.getBitcast(MVT::i16, Addr);
      Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
    } else {
      // Pack this element together with the following one into one dword.
      Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
      I++;
    }
    Addr = DAG.getBitcast(MVT::f32, Addr);
    PackedAddrs.push_back(Addr);
  }
}
9565
// Lower an image intrinsic to a MIMG machine node: pack the vdata/address
// operands, pick the encoding for the subtarget, and rebuild the IR-level
// return value (including the optional TexFailCtrl result).
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     SelectionDAG &DAG, bool WithChain) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
  unsigned IntrOpcode = Intr->BaseOpcode;
  // For image atomic: use no-return opcode if result is unused.
  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
      !Op.getNode()->hasAnyUseOfValue(0))
    IntrOpcode = Intr->AtomicNoRetBaseOpcode;
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
  bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);

  SmallVector<EVT, 3> ResultTypes(Op->values());
  SmallVector<EVT, 3> OrigResultTypes(Op->values());
  if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
    ResultTypes.erase(&ResultTypes[0]);

  bool IsD16 = false;
  bool IsG16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  // Offset of intrinsic arguments
  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMask;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);

    bool Is64Bit = VData.getValueSizeInBits() == 64;
    if (BaseOpcode->AtomicX2) {
      // Compare-and-swap style atomics carry two data operands; merge them
      // into a single vector vdata.
      SDValue VData2 = Op.getOperand(3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      if (!BaseOpcode->NoReturn)
        ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG, true);
      }

      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
        return Op;

      // The sq block of gfx8 and gfx9 do not estimate register use correctly
      // for d16 image_gather4, image_gather4_l, and image_gather4_lz
      // instructions.
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }
  }

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;

  // Check for 16 bit addresses or derivatives and pack if true.
  MVT VAddrVT =
      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT VAddrScalarVT = VAddrVT.getScalarType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  VAddrScalarVT = VAddrVT.getScalarType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // Push back extra arguments.
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
      // Special handling of bias when A16 is on. Bias is of type half but
      // occupies full 32-bit.
      SDValue Bias = DAG.getBuildVector(
          MVT::v2f16, DL,
          {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
      VAddrs.push_back(Bias);
    } else {
      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
             "Bias needs to be converted to 16 bit in A16 mode");
      VAddrs.push_back(Op.getOperand(ArgOffset + I));
    }
  }

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    LLVM_DEBUG(
        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");
    return Op;
  }

  if (IsA16) {
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
      return Op;
    }
  }

  // We've dealt with incorrect input so we know that if IsA16, IsG16
  // are set then we have to compress/pack operands (either address,
  // gradient or both)
  // In the case where a16 and gradients are tied (no G16 support) then we
  // have already verified that both IsA16 and IsG16 are true
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // Activate g16
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
  }

  // Add gradients (packed or unpacked)
  if (IsG16) {
    // Pack the gradients
    // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
    packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
                              ArgOffset + Intr->GradientStart,
                              ArgOffset + Intr->CoordStart, Intr->NumGradients);
  } else {
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // Add addresses (packed or unpacked)
  if (IsA16) {
    packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
                              ArgOffset + Intr->CoordStart, VAddrEnd,
                              0 /* No gradients */);
  } else {
    // Add uncompressed address
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  //
  // Partial NSA is allowed on GFX11+ where the final register is a contiguous
  // set of the remaining addresses.
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  SDValue VAddr;
  if (UsePartialNSA) {
    VAddr = getBuildDwordsVector(DAG, DL,
                                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
  }

  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return
    // type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getPOISON(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, NumVDataDwords)
                                   : MVT::i32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  // Keep GLC only when the atomic's result is actually used.
  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
    return Op;

  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
    Ops.push_back(VAddr);
  } else if (UseNSA)
    append_range(Ops, VAddrs);
  else
    Ops.push_back(VAddr);
  SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
  EVT RsrcVT = Rsrc.getValueType();
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
    return Op;
  Ops.push_back(Rsrc);
  if (BaseOpcode->Sampler) {
    SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
    if (Samp.getValueType() != MVT::v4i32)
      return Op;
    Ops.push_back(Samp);
  }
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10Plus)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                        ST->hasFeature(AMDGPU::FeatureR128A16)
                    ? True
                    : False);
  if (IsGFX10Plus)
    Ops.push_back(IsA16 ? True : False);

  if (!Subtarget->hasGFX90AInsts())
    Ops.push_back(TFE); // tfe
  else if (TFE->getAsZExtVal()) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        "TFE is not supported on this GPU", DL.getDebugLoc()));
  }

  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  if (!IsGFX10Plus)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  // Select the concrete MIMG opcode for the subtarget's encoding generation.
  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
            "requested image instruction is not supported on this GPU",
            DL.getDebugLoc()));

        unsigned Idx = 0;
        SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
        for (EVT VT : OrigResultTypes) {
          if (VT == MVT::Other)
            RetValues[Idx++] = Op.getOperand(0); // Chain
          else
            RetValues[Idx++] = DAG.getPOISON(VT);
        }

        return DAG.getMergeValues(RetValues, DL);
      }
    }
    if (Opcode == -1 &&
        Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->NoReturn) {
    if (BaseOpcode->Atomic)
      return DAG.getMergeValues(
          {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);

    return SDValue(NewNode, 0);
  }

  if (BaseOpcode->AtomicX2) {
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  }

  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
9980
// Lower an s_buffer_load of type \p VT. Uniform offsets use the scalar
// SBUFFER_LOAD path; divergent offsets fall back to MUBUF buffer loads.
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =
      DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      VT.getStoreSize(), Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
      auto WidenedOp = DAG.getMemIntrinsicNode(
          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
      auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
                                   DAG.getVectorIdxConstant(0, DL));
      return Subvector;
    }

    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SDValue Ops[] = {
      DAG.getEntryNode(),                    // Chain
      Rsrc,                                  // rsrc
      DAG.getConstant(0, DL, MVT::i32),      // vindex
      {},                                    // voffset
      {},                                    // soffset
      {},                                    // offset
      CachePolicy,                           // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
  };
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }

  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  // Wide results are split into multiple dwordx4 loads.
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                        LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}
10075
10076SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10077 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10078 if (!Subtarget->hasArchitectedSGPRs())
10079 return {};
10080 SDLoc SL(Op);
10081 MVT VT = MVT::i32;
10082 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10083 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10084 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10085}
10086
10087SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10088 AMDGPU::Hwreg::Id HwReg,
10089 unsigned LowBit,
10090 unsigned Width) const {
10091 SDLoc SL(Op);
10092 using namespace AMDGPU::Hwreg;
10093 return {DAG.getMachineNode(
10094 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10095 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10096 SL, MVT::i32)),
10097 0};
10098}
10099
10100SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10101 unsigned Dim,
10102 const ArgDescriptor &Arg) const {
10103 SDLoc SL(Op);
10104 MachineFunction &MF = DAG.getMachineFunction();
10105 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10106 if (MaxID == 0)
10107 return DAG.getConstant(0, SL, MVT::i32);
10108
10109 // It's undefined behavior if a function marked with the amdgpu-no-*
10110 // attributes uses the corresponding intrinsic.
10111 if (!Arg)
10112 return DAG.getPOISON(Op->getValueType(0));
10113
10114 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10115 SDLoc(DAG.getEntryNode()), Arg);
10116
10117 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10118 // masking operations anyway.
10119 //
10120 // TODO: We could assert the top bit is 0 for the source copy.
10121 if (Arg.isMasked())
10122 return Val;
10123
10124 // Preserve the known bits after expansion to a copy.
10125 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10126 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10127 DAG.getValueType(SmallVT));
10128}
10129
10130SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10131 SelectionDAG &DAG) const {
10132 MachineFunction &MF = DAG.getMachineFunction();
10133 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10134
10135 EVT VT = Op.getValueType();
10136 SDLoc DL(Op);
10137 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10138
10139 // TODO: Should this propagate fast-math-flags?
10140
10141 switch (IntrinsicID) {
10142 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10143 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10144 return emitNonHSAIntrinsicError(DAG, DL, VT);
10145 return getPreloadedValue(DAG, *MFI, VT,
10147 }
10148 case Intrinsic::amdgcn_dispatch_ptr:
10149 case Intrinsic::amdgcn_queue_ptr: {
10150 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10151 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10152 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10153 DL.getDebugLoc()));
10154 return DAG.getPOISON(VT);
10155 }
10156
10157 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10160 return getPreloadedValue(DAG, *MFI, VT, RegID);
10161 }
10162 case Intrinsic::amdgcn_implicitarg_ptr: {
10163 if (MFI->isEntryFunction())
10164 return getImplicitArgPtr(DAG, DL);
10165 return getPreloadedValue(DAG, *MFI, VT,
10167 }
10168 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10169 if (!AMDGPU::isKernel(MF.getFunction())) {
10170 // This only makes sense to call in a kernel, so just lower to null.
10171 return DAG.getConstant(0, DL, VT);
10172 }
10173
10174 return getPreloadedValue(DAG, *MFI, VT,
10176 }
10177 case Intrinsic::amdgcn_dispatch_id: {
10178 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10179 }
10180 case Intrinsic::amdgcn_rcp:
10181 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10182 case Intrinsic::amdgcn_rsq:
10183 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10184 case Intrinsic::amdgcn_rsq_legacy:
10185 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10186 return emitRemovedIntrinsicError(DAG, DL, VT);
10187 return SDValue();
10188 case Intrinsic::amdgcn_rcp_legacy:
10189 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10190 return emitRemovedIntrinsicError(DAG, DL, VT);
10191 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10192 case Intrinsic::amdgcn_rsq_clamp: {
10193 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10194 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10195
10196 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10197 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10198 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10199
10200 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10201 SDValue Tmp =
10202 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10203 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10204 DAG.getConstantFP(Min, DL, VT));
10205 }
10206 case Intrinsic::r600_read_ngroups_x:
10207 if (Subtarget->isAmdHsaOS())
10208 return emitNonHSAIntrinsicError(DAG, DL, VT);
10209
10210 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10212 false);
10213 case Intrinsic::r600_read_ngroups_y:
10214 if (Subtarget->isAmdHsaOS())
10215 return emitNonHSAIntrinsicError(DAG, DL, VT);
10216
10217 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10219 false);
10220 case Intrinsic::r600_read_ngroups_z:
10221 if (Subtarget->isAmdHsaOS())
10222 return emitNonHSAIntrinsicError(DAG, DL, VT);
10223
10224 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10226 false);
10227 case Intrinsic::r600_read_local_size_x:
10228 if (Subtarget->isAmdHsaOS())
10229 return emitNonHSAIntrinsicError(DAG, DL, VT);
10230
10231 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10233 case Intrinsic::r600_read_local_size_y:
10234 if (Subtarget->isAmdHsaOS())
10235 return emitNonHSAIntrinsicError(DAG, DL, VT);
10236
10237 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10239 case Intrinsic::r600_read_local_size_z:
10240 if (Subtarget->isAmdHsaOS())
10241 return emitNonHSAIntrinsicError(DAG, DL, VT);
10242
10243 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10245 case Intrinsic::amdgcn_workgroup_id_x:
10246 return lowerWorkGroupId(DAG, *MFI, VT,
10250 case Intrinsic::amdgcn_workgroup_id_y:
10251 return lowerWorkGroupId(DAG, *MFI, VT,
10255 case Intrinsic::amdgcn_workgroup_id_z:
10256 return lowerWorkGroupId(DAG, *MFI, VT,
10260 case Intrinsic::amdgcn_cluster_id_x:
10261 return Subtarget->hasClusters()
10262 ? getPreloadedValue(DAG, *MFI, VT,
10264 : DAG.getPOISON(VT);
10265 case Intrinsic::amdgcn_cluster_id_y:
10266 return Subtarget->hasClusters()
10267 ? getPreloadedValue(DAG, *MFI, VT,
10269 : DAG.getPOISON(VT);
10270 case Intrinsic::amdgcn_cluster_id_z:
10271 return Subtarget->hasClusters()
10272 ? getPreloadedValue(DAG, *MFI, VT,
10274 : DAG.getPOISON(VT);
10275 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10276 return Subtarget->hasClusters()
10277 ? getPreloadedValue(
10278 DAG, *MFI, VT,
10280 : DAG.getPOISON(VT);
10281 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10282 return Subtarget->hasClusters()
10283 ? getPreloadedValue(
10284 DAG, *MFI, VT,
10286 : DAG.getPOISON(VT);
10287 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10288 return Subtarget->hasClusters()
10289 ? getPreloadedValue(
10290 DAG, *MFI, VT,
10292 : DAG.getPOISON(VT);
10293 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10294 return Subtarget->hasClusters()
10295 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10296 : SDValue();
10297 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10298 return Subtarget->hasClusters()
10299 ? getPreloadedValue(
10300 DAG, *MFI, VT,
10302 : DAG.getPOISON(VT);
10303 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10304 return Subtarget->hasClusters()
10305 ? getPreloadedValue(
10306 DAG, *MFI, VT,
10308 : DAG.getPOISON(VT);
10309 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10310 return Subtarget->hasClusters()
10311 ? getPreloadedValue(
10312 DAG, *MFI, VT,
10314 : DAG.getPOISON(VT);
10315 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10316 return Subtarget->hasClusters()
10317 ? getPreloadedValue(
10318 DAG, *MFI, VT,
10320 : DAG.getPOISON(VT);
10321 case Intrinsic::amdgcn_wave_id:
10322 return lowerWaveID(DAG, Op);
10323 case Intrinsic::amdgcn_lds_kernel_id: {
10324 if (MFI->isEntryFunction())
10325 return getLDSKernelId(DAG, DL);
10326 return getPreloadedValue(DAG, *MFI, VT,
10328 }
10329 case Intrinsic::amdgcn_workitem_id_x:
10330 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10331 case Intrinsic::amdgcn_workitem_id_y:
10332 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10333 case Intrinsic::amdgcn_workitem_id_z:
10334 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10335 case Intrinsic::amdgcn_wavefrontsize:
10336 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10337 SDLoc(Op), MVT::i32);
10338 case Intrinsic::amdgcn_s_buffer_load: {
10339 unsigned CPol = Op.getConstantOperandVal(3);
10340 // s_buffer_load, because of how it's optimized, can't be volatile
10341 // so reject ones with the volatile bit set.
10342 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10345 return Op;
10346 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10347 Op.getOperand(3), DAG);
10348 }
10349 case Intrinsic::amdgcn_fdiv_fast:
10350 return lowerFDIV_FAST(Op, DAG);
10351 case Intrinsic::amdgcn_sin:
10352 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10353
10354 case Intrinsic::amdgcn_cos:
10355 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10356
10357 case Intrinsic::amdgcn_mul_u24:
10358 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10359 Op.getOperand(2));
10360 case Intrinsic::amdgcn_mul_i24:
10361 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10362 Op.getOperand(2));
10363
10364 case Intrinsic::amdgcn_log_clamp: {
10365 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10366 return SDValue();
10367
10368 return emitRemovedIntrinsicError(DAG, DL, VT);
10369 }
10370 case Intrinsic::amdgcn_fract:
10371 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10372
10373 case Intrinsic::amdgcn_class:
10374 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10375 Op.getOperand(2));
10376 case Intrinsic::amdgcn_div_fmas:
10377 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10378 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10379
10380 case Intrinsic::amdgcn_div_fixup:
10381 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10382 Op.getOperand(2), Op.getOperand(3));
10383
10384 case Intrinsic::amdgcn_div_scale: {
10385 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10386
10387 // Translate to the operands expected by the machine instruction. The
10388 // first parameter must be the same as the first instruction.
10389 SDValue Numerator = Op.getOperand(1);
10390 SDValue Denominator = Op.getOperand(2);
10391
10392 // Note this order is opposite of the machine instruction's operations,
10393 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10394 // intrinsic has the numerator as the first operand to match a normal
10395 // division operation.
10396
10397 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10398
10399 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10400 Denominator, Numerator);
10401 }
10402 case Intrinsic::amdgcn_icmp: {
10403 // There is a Pat that handles this variant, so return it as-is.
10404 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10405 Op.getConstantOperandVal(2) == 0 &&
10406 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10407 return Op;
10408 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10409 }
10410 case Intrinsic::amdgcn_fcmp: {
10411 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10412 }
10413 case Intrinsic::amdgcn_ballot:
10414 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10415 case Intrinsic::amdgcn_fmed3:
10416 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10417 Op.getOperand(2), Op.getOperand(3));
10418 case Intrinsic::amdgcn_fdot2:
10419 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10420 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10421 case Intrinsic::amdgcn_fmul_legacy:
10422 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10423 Op.getOperand(2));
10424 case Intrinsic::amdgcn_sffbh:
10425 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10426 case Intrinsic::amdgcn_sbfe:
10427 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10428 Op.getOperand(2), Op.getOperand(3));
10429 case Intrinsic::amdgcn_ubfe:
10430 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10431 Op.getOperand(2), Op.getOperand(3));
10432 case Intrinsic::amdgcn_cvt_pkrtz:
10433 case Intrinsic::amdgcn_cvt_pknorm_i16:
10434 case Intrinsic::amdgcn_cvt_pknorm_u16:
10435 case Intrinsic::amdgcn_cvt_pk_i16:
10436 case Intrinsic::amdgcn_cvt_pk_u16: {
10437 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10438 EVT VT = Op.getValueType();
10439 unsigned Opcode;
10440
10441 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10442 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10443 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10444 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10445 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10446 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10447 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10448 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10449 else
10450 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10451
10452 if (isTypeLegal(VT))
10453 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10454
10455 SDValue Node =
10456 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10457 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10458 }
10459 case Intrinsic::amdgcn_fmad_ftz:
10460 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10461 Op.getOperand(2), Op.getOperand(3));
10462
10463 case Intrinsic::amdgcn_if_break:
10464 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10465 Op->getOperand(1), Op->getOperand(2)),
10466 0);
10467
10468 case Intrinsic::amdgcn_groupstaticsize: {
10470 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10471 return Op;
10472
10473 const Module *M = MF.getFunction().getParent();
10474 const GlobalValue *GV =
10475 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10476 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10478 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10479 }
10480 case Intrinsic::amdgcn_is_shared:
10481 case Intrinsic::amdgcn_is_private: {
10482 SDLoc SL(Op);
10483 SDValue SrcVec =
10484 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10485 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10486 DAG.getConstant(1, SL, MVT::i32));
10487
10488 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10490 : AMDGPUAS::PRIVATE_ADDRESS;
10491 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10492 Subtarget->hasGloballyAddressableScratch()) {
10493 SDValue FlatScratchBaseHi(
10494 DAG.getMachineNode(
10495 AMDGPU::S_MOV_B32, DL, MVT::i32,
10496 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10497 0);
10498 // Test bits 63..58 against the aperture address.
10499 return DAG.getSetCC(
10500 SL, MVT::i1,
10501 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10502 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10503 }
10504
10505 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10506 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10507 }
10508 case Intrinsic::amdgcn_perm:
10509 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10510 Op.getOperand(2), Op.getOperand(3));
10511 case Intrinsic::amdgcn_reloc_constant: {
10512 Module *M = MF.getFunction().getParent();
10513 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10514 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10515 auto *RelocSymbol = cast<GlobalVariable>(
10516 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10517 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10519 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10520 }
10521 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10522 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10529 if (Op.getOperand(4).getValueType() == MVT::i32)
10530 return SDValue();
10531
10532 SDLoc SL(Op);
10533 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10534 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10535 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10536 Op.getOperand(3), IndexKeyi32);
10537 }
10538 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10539 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10540 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10541 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10542 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10543 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10544 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10545 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10546 if (Op.getOperand(4).getValueType() == MVT::i64)
10547 return SDValue();
10548
10549 SDLoc SL(Op);
10550 auto IndexKeyi64 =
10551 Op.getOperand(4).getValueType() == MVT::v2i32
10552 ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
10553 : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10554 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10555 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10556 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10557 Op.getOperand(6)});
10558 }
10559 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10560 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10561 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10562 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10563 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10564 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10565 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10566 ? MVT::i64
10567 : MVT::i32;
10568 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10569 return SDValue();
10570
10571 SDLoc SL(Op);
10572 auto IndexKey =
10573 Op.getOperand(6).getValueType().isVector()
10574 ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
10575 : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10577 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10578 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10579 IndexKey, Op.getOperand(7), Op.getOperand(8)};
10580 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10581 Args.push_back(Op.getOperand(9));
10582 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
10583 }
10584 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10585 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10586 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10587 if (Op.getOperand(6).getValueType() == MVT::i32)
10588 return SDValue();
10589
10590 SDLoc SL(Op);
10591 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10592 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10593 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10594 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10595 IndexKeyi32, Op.getOperand(7)});
10596 }
10597 case Intrinsic::amdgcn_addrspacecast_nonnull:
10598 return lowerADDRSPACECAST(Op, DAG);
10599 case Intrinsic::amdgcn_readlane:
10600 case Intrinsic::amdgcn_readfirstlane:
10601 case Intrinsic::amdgcn_writelane:
10602 case Intrinsic::amdgcn_permlane16:
10603 case Intrinsic::amdgcn_permlanex16:
10604 case Intrinsic::amdgcn_permlane64:
10605 case Intrinsic::amdgcn_set_inactive:
10606 case Intrinsic::amdgcn_set_inactive_chain_arg:
10607 case Intrinsic::amdgcn_mov_dpp8:
10608 case Intrinsic::amdgcn_update_dpp:
10609 return lowerLaneOp(*this, Op.getNode(), DAG);
10610 case Intrinsic::amdgcn_dead: {
10612 for (const EVT ValTy : Op.getNode()->values())
10613 Poisons.push_back(DAG.getPOISON(ValTy));
10614 return DAG.getMergeValues(Poisons, SDLoc(Op));
10615 }
10616 case Intrinsic::amdgcn_wave_shuffle:
10617 return lowerWaveShuffle(*this, Op.getNode(), DAG);
10618 default:
10619 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10621 return lowerImage(Op, ImageDimIntr, DAG, false);
10622
10623 return Op;
10624 }
10625}
10626
10627// On targets not supporting constant in soffset field, turn zero to
10628// SGPR_NULL to avoid generating an extra s_mov with zero.
10630 const GCNSubtarget *Subtarget) {
10631 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10632 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10633 return SOffset;
10634}
10635
10636SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10637 SelectionDAG &DAG,
10638 unsigned NewOpcode) const {
10639 SDLoc DL(Op);
10640
10641 SDValue VData = Op.getOperand(2);
10642 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10643 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10644 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10645 SDValue Ops[] = {
10646 Op.getOperand(0), // Chain
10647 VData, // vdata
10648 Rsrc, // rsrc
10649 DAG.getConstant(0, DL, MVT::i32), // vindex
10650 VOffset, // voffset
10651 SOffset, // soffset
10652 Offset, // offset
10653 Op.getOperand(6), // cachepolicy
10654 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10655 };
10656
10657 auto *M = cast<MemSDNode>(Op);
10658
10659 EVT MemVT = VData.getValueType();
10660 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10661 M->getMemOperand());
10662}
10663
10664SDValue
10665SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10666 unsigned NewOpcode) const {
10667 SDLoc DL(Op);
10668
10669 SDValue VData = Op.getOperand(2);
10670 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10671 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10672 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10673 SDValue Ops[] = {
10674 Op.getOperand(0), // Chain
10675 VData, // vdata
10676 Rsrc, // rsrc
10677 Op.getOperand(4), // vindex
10678 VOffset, // voffset
10679 SOffset, // soffset
10680 Offset, // offset
10681 Op.getOperand(7), // cachepolicy
10682 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10683 };
10684
10685 auto *M = cast<MemSDNode>(Op);
10686
10687 EVT MemVT = VData.getValueType();
10688 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10689 M->getMemOperand());
10690}
10691
10692SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10693 SelectionDAG &DAG) const {
10694 unsigned IntrID = Op.getConstantOperandVal(1);
10695 SDLoc DL(Op);
10696
10697 switch (IntrID) {
10698 case Intrinsic::amdgcn_ds_ordered_add:
10699 case Intrinsic::amdgcn_ds_ordered_swap: {
10700 MemSDNode *M = cast<MemSDNode>(Op);
10701 SDValue Chain = M->getOperand(0);
10702 SDValue M0 = M->getOperand(2);
10703 SDValue Value = M->getOperand(3);
10704 unsigned IndexOperand = M->getConstantOperandVal(7);
10705 unsigned WaveRelease = M->getConstantOperandVal(8);
10706 unsigned WaveDone = M->getConstantOperandVal(9);
10707
10708 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10709 IndexOperand &= ~0x3f;
10710 unsigned CountDw = 0;
10711
10712 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10713 CountDw = (IndexOperand >> 24) & 0xf;
10714 IndexOperand &= ~(0xf << 24);
10715
10716 if (CountDw < 1 || CountDw > 4) {
10717 const Function &Fn = DAG.getMachineFunction().getFunction();
10718 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10719 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10720 DL.getDebugLoc()));
10721 CountDw = 1;
10722 }
10723 }
10724
10725 if (IndexOperand) {
10726 const Function &Fn = DAG.getMachineFunction().getFunction();
10727 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10728 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10729 }
10730
10731 if (WaveDone && !WaveRelease) {
10732 // TODO: Move this to IR verifier
10733 const Function &Fn = DAG.getMachineFunction().getFunction();
10734 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10735 Fn, "ds_ordered_count: wave_done requires wave_release",
10736 DL.getDebugLoc()));
10737 }
10738
10739 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10740 unsigned ShaderType =
10742 unsigned Offset0 = OrderedCountIndex << 2;
10743 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10744
10745 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10746 Offset1 |= (CountDw - 1) << 6;
10747
10748 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10749 Offset1 |= ShaderType << 2;
10750
10751 unsigned Offset = Offset0 | (Offset1 << 8);
10752
10753 SDValue Ops[] = {
10754 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10755 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10756 };
10757 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10758 M->getVTList(), Ops, M->getMemoryVT(),
10759 M->getMemOperand());
10760 }
10761 case Intrinsic::amdgcn_raw_buffer_load:
10762 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10763 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10764 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10765 case Intrinsic::amdgcn_raw_buffer_load_format:
10766 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10767 const bool IsFormat =
10768 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10769 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10770
10771 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10772 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10773 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10774 SDValue Ops[] = {
10775 Op.getOperand(0), // Chain
10776 Rsrc, // rsrc
10777 DAG.getConstant(0, DL, MVT::i32), // vindex
10778 VOffset, // voffset
10779 SOffset, // soffset
10780 Offset, // offset
10781 Op.getOperand(5), // cachepolicy, swizzled buffer
10782 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10783 };
10784
10785 auto *M = cast<MemSDNode>(Op);
10786 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10787 }
10788 case Intrinsic::amdgcn_struct_buffer_load:
10789 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10790 case Intrinsic::amdgcn_struct_buffer_load_format:
10791 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10792 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10793 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10794 const bool IsFormat =
10795 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10796 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10797
10798 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10799 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10800 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10801 SDValue Ops[] = {
10802 Op.getOperand(0), // Chain
10803 Rsrc, // rsrc
10804 Op.getOperand(3), // vindex
10805 VOffset, // voffset
10806 SOffset, // soffset
10807 Offset, // offset
10808 Op.getOperand(6), // cachepolicy, swizzled buffer
10809 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10810 };
10811
10812 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10813 }
10814 case Intrinsic::amdgcn_raw_tbuffer_load:
10815 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10816 MemSDNode *M = cast<MemSDNode>(Op);
10817 EVT LoadVT = Op.getValueType();
10818 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10819 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10820 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10821
10822 SDValue Ops[] = {
10823 Op.getOperand(0), // Chain
10824 Rsrc, // rsrc
10825 DAG.getConstant(0, DL, MVT::i32), // vindex
10826 VOffset, // voffset
10827 SOffset, // soffset
10828 Offset, // offset
10829 Op.getOperand(5), // format
10830 Op.getOperand(6), // cachepolicy, swizzled buffer
10831 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10832 };
10833
10834 if (LoadVT.getScalarType() == MVT::f16)
10835 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10836 Ops);
10837 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10838 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10839 DAG);
10840 }
10841 case Intrinsic::amdgcn_struct_tbuffer_load:
10842 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10843 MemSDNode *M = cast<MemSDNode>(Op);
10844 EVT LoadVT = Op.getValueType();
10845 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10846 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10847 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10848
10849 SDValue Ops[] = {
10850 Op.getOperand(0), // Chain
10851 Rsrc, // rsrc
10852 Op.getOperand(3), // vindex
10853 VOffset, // voffset
10854 SOffset, // soffset
10855 Offset, // offset
10856 Op.getOperand(6), // format
10857 Op.getOperand(7), // cachepolicy, swizzled buffer
10858 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10859 };
10860
10861 if (LoadVT.getScalarType() == MVT::f16)
10862 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10863 Ops);
10864 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10865 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10866 DAG);
10867 }
10868 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10870 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10871 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10872 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10873 return lowerStructBufferAtomicIntrin(Op, DAG,
10874 AMDGPUISD::BUFFER_ATOMIC_FADD);
10875 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10876 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10877 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10878 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10880 return lowerStructBufferAtomicIntrin(Op, DAG,
10881 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10882 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10883 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10884 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10885 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10887 return lowerStructBufferAtomicIntrin(Op, DAG,
10888 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10889 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10890 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10891 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10892 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10893 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10894 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10895 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10897 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10898 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10900 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10901 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10903 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10904 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10905 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10906 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10907 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10908 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10909 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10910 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10912 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10913 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10915 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10916 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10918 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10919 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10920 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10921 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10922 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10923 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10924 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10925 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10927 return lowerStructBufferAtomicIntrin(Op, DAG,
10928 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10929 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10930 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10931 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10932 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10934 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10935 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10937 return lowerStructBufferAtomicIntrin(Op, DAG,
10938 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10939 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10941 return lowerStructBufferAtomicIntrin(Op, DAG,
10942 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10943 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10945 return lowerStructBufferAtomicIntrin(Op, DAG,
10946 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10947 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10949 return lowerStructBufferAtomicIntrin(Op, DAG,
10950 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10951 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10953 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10954 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10955 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10956 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10957 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10959 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10960 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10961 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10962 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10963 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10965 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10966 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10968 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10969 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10971 return lowerStructBufferAtomicIntrin(Op, DAG,
10972 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10973 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10974 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10975 return lowerRawBufferAtomicIntrin(Op, DAG,
10976 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10977 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10978 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10979 return lowerStructBufferAtomicIntrin(Op, DAG,
10980 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10981 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10982 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10983 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10984 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10985 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10986 SDValue Ops[] = {
10987 Op.getOperand(0), // Chain
10988 Op.getOperand(2), // src
10989 Op.getOperand(3), // cmp
10990 Rsrc, // rsrc
10991 DAG.getConstant(0, DL, MVT::i32), // vindex
10992 VOffset, // voffset
10993 SOffset, // soffset
10994 Offset, // offset
10995 Op.getOperand(7), // cachepolicy
10996 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10997 };
10998 EVT VT = Op.getValueType();
10999 auto *M = cast<MemSDNode>(Op);
11000
11001 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11002 Op->getVTList(), Ops, VT,
11003 M->getMemOperand());
11004 }
11005 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11006 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11007 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11008 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11009 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11010 SDValue Ops[] = {
11011 Op.getOperand(0), // Chain
11012 Op.getOperand(2), // src
11013 Op.getOperand(3), // cmp
11014 Rsrc, // rsrc
11015 Op.getOperand(5), // vindex
11016 VOffset, // voffset
11017 SOffset, // soffset
11018 Offset, // offset
11019 Op.getOperand(8), // cachepolicy
11020 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11021 };
11022 EVT VT = Op.getValueType();
11023 auto *M = cast<MemSDNode>(Op);
11024
11025 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11026 Op->getVTList(), Ops, VT,
11027 M->getMemOperand());
11028 }
11029 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11030 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11031 MemSDNode *M = cast<MemSDNode>(Op);
11032 SDValue NodePtr = M->getOperand(2);
11033 SDValue RayExtent = M->getOperand(3);
11034 SDValue InstanceMask = M->getOperand(4);
11035 SDValue RayOrigin = M->getOperand(5);
11036 SDValue RayDir = M->getOperand(6);
11037 SDValue Offsets = M->getOperand(7);
11038 SDValue TDescr = M->getOperand(8);
11039
11040 assert(NodePtr.getValueType() == MVT::i64);
11041 assert(RayDir.getValueType() == MVT::v3f32);
11042
11043 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11044 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11045 return SDValue();
11046 }
11047
11048 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11049 const unsigned NumVDataDwords = 10;
11050 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11051 int Opcode = AMDGPU::getMIMGOpcode(
11052 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11053 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11054 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11055 assert(Opcode != -1);
11056
11058 Ops.push_back(NodePtr);
11059 Ops.push_back(DAG.getBuildVector(
11060 MVT::v2i32, DL,
11061 {DAG.getBitcast(MVT::i32, RayExtent),
11062 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11063 Ops.push_back(RayOrigin);
11064 Ops.push_back(RayDir);
11065 Ops.push_back(Offsets);
11066 Ops.push_back(TDescr);
11067 Ops.push_back(M->getChain());
11068
11069 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11070 MachineMemOperand *MemRef = M->getMemOperand();
11071 DAG.setNodeMemRefs(NewNode, {MemRef});
11072 return SDValue(NewNode, 0);
11073 }
11074 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11075 MemSDNode *M = cast<MemSDNode>(Op);
11076 SDValue NodePtr = M->getOperand(2);
11077 SDValue RayExtent = M->getOperand(3);
11078 SDValue RayOrigin = M->getOperand(4);
11079 SDValue RayDir = M->getOperand(5);
11080 SDValue RayInvDir = M->getOperand(6);
11081 SDValue TDescr = M->getOperand(7);
11082
11083 assert(NodePtr.getValueType() == MVT::i32 ||
11084 NodePtr.getValueType() == MVT::i64);
11085 assert(RayDir.getValueType() == MVT::v3f16 ||
11086 RayDir.getValueType() == MVT::v3f32);
11087
11088 if (!Subtarget->hasGFX10_AEncoding()) {
11089 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11090 return SDValue();
11091 }
11092
11093 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11094 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11095 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11096 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11097 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11098 const unsigned NumVDataDwords = 4;
11099 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11100 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11101 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11102 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11103 IsGFX12Plus;
11104 const unsigned BaseOpcodes[2][2] = {
11105 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11106 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11107 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11108 int Opcode;
11109 if (UseNSA) {
11110 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11111 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11112 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11113 : AMDGPU::MIMGEncGfx10NSA,
11114 NumVDataDwords, NumVAddrDwords);
11115 } else {
11116 assert(!IsGFX12Plus);
11117 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11118 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11119 : AMDGPU::MIMGEncGfx10Default,
11120 NumVDataDwords, NumVAddrDwords);
11121 }
11122 assert(Opcode != -1);
11123
11125
11126 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11128 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11129 if (Lanes[0].getValueSizeInBits() == 32) {
11130 for (unsigned I = 0; I < 3; ++I)
11131 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11132 } else {
11133 if (IsAligned) {
11134 Ops.push_back(DAG.getBitcast(
11135 MVT::i32,
11136 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11137 Ops.push_back(Lanes[2]);
11138 } else {
11139 SDValue Elt0 = Ops.pop_back_val();
11140 Ops.push_back(DAG.getBitcast(
11141 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11142 Ops.push_back(DAG.getBitcast(
11143 MVT::i32,
11144 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11145 }
11146 }
11147 };
11148
11149 if (UseNSA && IsGFX11Plus) {
11150 Ops.push_back(NodePtr);
11151 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11152 Ops.push_back(RayOrigin);
11153 if (IsA16) {
11154 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11155 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11156 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11157 for (unsigned I = 0; I < 3; ++I) {
11158 MergedLanes.push_back(DAG.getBitcast(
11159 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11160 {DirLanes[I], InvDirLanes[I]})));
11161 }
11162 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11163 } else {
11164 Ops.push_back(RayDir);
11165 Ops.push_back(RayInvDir);
11166 }
11167 } else {
11168 if (Is64)
11169 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11170 2);
11171 else
11172 Ops.push_back(NodePtr);
11173
11174 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11175 packLanes(RayOrigin, true);
11176 packLanes(RayDir, true);
11177 packLanes(RayInvDir, false);
11178 }
11179
11180 if (!UseNSA) {
11181 // Build a single vector containing all the operands so far prepared.
11182 if (NumVAddrDwords > 12) {
11183 SDValue Undef = DAG.getPOISON(MVT::i32);
11184 Ops.append(16 - Ops.size(), Undef);
11185 }
11186 assert(Ops.size() >= 8 && Ops.size() <= 12);
11187 SDValue MergedOps =
11188 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11189 Ops.clear();
11190 Ops.push_back(MergedOps);
11191 }
11192
11193 Ops.push_back(TDescr);
11194 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11195 Ops.push_back(M->getChain());
11196
11197 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11198 MachineMemOperand *MemRef = M->getMemOperand();
11199 DAG.setNodeMemRefs(NewNode, {MemRef});
11200 return SDValue(NewNode, 0);
11201 }
11202 case Intrinsic::amdgcn_global_atomic_fmin_num:
11203 case Intrinsic::amdgcn_global_atomic_fmax_num:
11204 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11205 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11206 MemSDNode *M = cast<MemSDNode>(Op);
11207 SDValue Ops[] = {
11208 M->getOperand(0), // Chain
11209 M->getOperand(2), // Ptr
11210 M->getOperand(3) // Value
11211 };
11212 unsigned Opcode = 0;
11213 switch (IntrID) {
11214 case Intrinsic::amdgcn_global_atomic_fmin_num:
11215 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11216 Opcode = ISD::ATOMIC_LOAD_FMIN;
11217 break;
11218 }
11219 case Intrinsic::amdgcn_global_atomic_fmax_num:
11220 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11221 Opcode = ISD::ATOMIC_LOAD_FMAX;
11222 break;
11223 }
11224 default:
11225 llvm_unreachable("unhandled atomic opcode");
11226 }
11227 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11228 Ops, M->getMemOperand());
11229 }
11230 case Intrinsic::amdgcn_s_alloc_vgpr: {
11231 SDValue NumVGPRs = Op.getOperand(2);
11232 if (!NumVGPRs->isDivergent())
11233 return Op;
11234
11235 SDValue ReadFirstLaneID =
11236 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11237 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11238 ReadFirstLaneID, NumVGPRs);
11239
11240 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11241 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11242 }
11243 case Intrinsic::amdgcn_s_get_barrier_state:
11244 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11245 SDValue Chain = Op->getOperand(0);
11247 unsigned Opc;
11248
11249 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11250 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11251 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11252 BarID = (BarID >> 4) & 0x3F;
11253 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11254 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11255 Ops.push_back(K);
11256 Ops.push_back(Chain);
11257 } else {
11258 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11259 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11260 SDValue M0Val;
11261 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11262 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11263 M0Val = SDValue(
11264 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11265 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11266 0);
11267 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11268 } else
11269 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11270 }
11271
11272 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11273 return SDValue(NewMI, 0);
11274 }
11275 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11276 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11277 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11278 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11279 SDValue Chain = Op->getOperand(0);
11280 SDValue Ptr = Op->getOperand(2);
11281 EVT VT = Op->getValueType(0);
11282 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11283 Chain, Ptr, MII->getMemOperand());
11284 }
11285 case Intrinsic::amdgcn_flat_load_monitor_b32:
11286 case Intrinsic::amdgcn_flat_load_monitor_b64:
11287 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11288 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11289 SDValue Chain = Op->getOperand(0);
11290 SDValue Ptr = Op->getOperand(2);
11291 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
11292 Op->getVTList(), {Chain, Ptr},
11293 MII->getMemoryVT(), MII->getMemOperand());
11294 }
11295 case Intrinsic::amdgcn_global_load_monitor_b32:
11296 case Intrinsic::amdgcn_global_load_monitor_b64:
11297 case Intrinsic::amdgcn_global_load_monitor_b128: {
11298 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11299 SDValue Chain = Op->getOperand(0);
11300 SDValue Ptr = Op->getOperand(2);
11301 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
11302 Op->getVTList(), {Chain, Ptr},
11303 MII->getMemoryVT(), MII->getMemOperand());
11304 }
11305 default:
11306
11307 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11309 return lowerImage(Op, ImageDimIntr, DAG, true);
11310
11311 return SDValue();
11312 }
11313}
11314
11315// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11316// dwordx4 if on SI and handle TFE loads.
11317SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11318 SDVTList VTList,
11319 ArrayRef<SDValue> Ops, EVT MemVT,
11320 MachineMemOperand *MMO,
11321 SelectionDAG &DAG) const {
11322 LLVMContext &C = *DAG.getContext();
11323 MachineFunction &MF = DAG.getMachineFunction();
11324 EVT VT = VTList.VTs[0];
11325
11326 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11327 bool IsTFE = VTList.NumVTs == 3;
11328 if (IsTFE) {
11329 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11330 unsigned NumOpDWords = NumValueDWords + 1;
11331 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11332 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11333 MachineMemOperand *OpDWordsMMO =
11334 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11335 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11336 OpDWordsVT, OpDWordsMMO, DAG);
11337 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11338 DAG.getVectorIdxConstant(NumValueDWords, DL));
11339 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11340 SDValue ValueDWords =
11341 NumValueDWords == 1
11342 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11344 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11345 ZeroIdx);
11346 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11347 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11348 }
11349
11350 if (!Subtarget->hasDwordx3LoadStores() &&
11351 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11352 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11353 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11354 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11355 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11356 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11357 WidenedMemVT, WidenedMMO);
11359 DAG.getVectorIdxConstant(0, DL));
11360 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11361 }
11362
11363 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11364}
11365
11366SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11367 bool ImageStore) const {
11368 EVT StoreVT = VData.getValueType();
11369
11370 // No change for f16 and legal vector D16 types.
11371 if (!StoreVT.isVector())
11372 return VData;
11373
11374 SDLoc DL(VData);
11375 unsigned NumElements = StoreVT.getVectorNumElements();
11376
11377 if (Subtarget->hasUnpackedD16VMem()) {
11378 // We need to unpack the packed data to store.
11379 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11380 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11381
11382 EVT EquivStoreVT =
11383 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11384 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11385 return DAG.UnrollVectorOp(ZExt.getNode());
11386 }
11387
11388 // The sq block of gfx8.1 does not estimate register use correctly for d16
11389 // image store instructions. The data operand is computed as if it were not a
11390 // d16 image instruction.
11391 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11392 // Bitcast to i16
11393 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11394 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11395
11396 // Decompose into scalars
11398 DAG.ExtractVectorElements(IntVData, Elts);
11399
11400 // Group pairs of i16 into v2i16 and bitcast to i32
11401 SmallVector<SDValue, 4> PackedElts;
11402 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11403 SDValue Pair =
11404 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11405 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11406 PackedElts.push_back(IntPair);
11407 }
11408 if ((NumElements % 2) == 1) {
11409 // Handle v3i16
11410 unsigned I = Elts.size() / 2;
11411 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11412 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11413 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11414 PackedElts.push_back(IntPair);
11415 }
11416
11417 // Pad using UNDEF
11418 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11419
11420 // Build final vector
11421 EVT VecVT =
11422 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11423 return DAG.getBuildVector(VecVT, DL, PackedElts);
11424 }
11425
11426 if (NumElements == 3) {
11427 EVT IntStoreVT =
11429 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11430
11431 EVT WidenedStoreVT = EVT::getVectorVT(
11432 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11433 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11434 WidenedStoreVT.getStoreSizeInBits());
11435 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11436 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11437 }
11438
11439 assert(isTypeLegal(StoreVT));
11440 return VData;
11441}
11442
11443static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11444 switch (Intr) {
11445 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11446 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11447 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11448 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11449 case Intrinsic::amdgcn_load_async_to_lds:
11450 case Intrinsic::amdgcn_global_load_async_lds:
11451 return true;
11452 }
11453 return false;
11454}
11455
11456SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11457 SelectionDAG &DAG) const {
11458 SDLoc DL(Op);
11459 SDValue Chain = Op.getOperand(0);
11460 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11461
11462 switch (IntrinsicID) {
11463 case Intrinsic::amdgcn_exp_compr: {
11464 if (!Subtarget->hasCompressedExport()) {
11465 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11467 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11468 }
11469 SDValue Src0 = Op.getOperand(4);
11470 SDValue Src1 = Op.getOperand(5);
11471 // Hack around illegal type on SI by directly selecting it.
11472 if (isTypeLegal(Src0.getValueType()))
11473 return SDValue();
11474
11475 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11476 SDValue Undef = DAG.getPOISON(MVT::f32);
11477 const SDValue Ops[] = {
11478 Op.getOperand(2), // tgt
11479 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11480 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11481 Undef, // src2
11482 Undef, // src3
11483 Op.getOperand(7), // vm
11484 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11485 Op.getOperand(3), // en
11486 Op.getOperand(0) // Chain
11487 };
11488
11489 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11490 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11491 }
11492
11493 case Intrinsic::amdgcn_struct_tbuffer_store:
11494 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11495 SDValue VData = Op.getOperand(2);
11496 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11497 if (IsD16)
11498 VData = handleD16VData(VData, DAG);
11499 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11500 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11501 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11502 SDValue Ops[] = {
11503 Chain,
11504 VData, // vdata
11505 Rsrc, // rsrc
11506 Op.getOperand(4), // vindex
11507 VOffset, // voffset
11508 SOffset, // soffset
11509 Offset, // offset
11510 Op.getOperand(7), // format
11511 Op.getOperand(8), // cachepolicy, swizzled buffer
11512 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11513 };
11514 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11515 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11516 MemSDNode *M = cast<MemSDNode>(Op);
11517 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11518 M->getMemoryVT(), M->getMemOperand());
11519 }
11520
11521 case Intrinsic::amdgcn_raw_tbuffer_store:
11522 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11523 SDValue VData = Op.getOperand(2);
11524 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11525 if (IsD16)
11526 VData = handleD16VData(VData, DAG);
11527 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11528 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11529 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11530 SDValue Ops[] = {
11531 Chain,
11532 VData, // vdata
11533 Rsrc, // rsrc
11534 DAG.getConstant(0, DL, MVT::i32), // vindex
11535 VOffset, // voffset
11536 SOffset, // soffset
11537 Offset, // offset
11538 Op.getOperand(6), // format
11539 Op.getOperand(7), // cachepolicy, swizzled buffer
11540 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11541 };
11542 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11543 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11544 MemSDNode *M = cast<MemSDNode>(Op);
11545 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11546 M->getMemoryVT(), M->getMemOperand());
11547 }
11548
11549 case Intrinsic::amdgcn_raw_buffer_store:
11550 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11551 case Intrinsic::amdgcn_raw_buffer_store_format:
11552 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11553 const bool IsFormat =
11554 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11555 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11556
11557 SDValue VData = Op.getOperand(2);
11558 EVT VDataVT = VData.getValueType();
11559 EVT EltType = VDataVT.getScalarType();
11560 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11561 if (IsD16) {
11562 VData = handleD16VData(VData, DAG);
11563 VDataVT = VData.getValueType();
11564 }
11565
11566 if (!isTypeLegal(VDataVT)) {
11567 VData =
11568 DAG.getNode(ISD::BITCAST, DL,
11569 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11570 }
11571
11572 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11573 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11574 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11575 SDValue Ops[] = {
11576 Chain,
11577 VData,
11578 Rsrc,
11579 DAG.getConstant(0, DL, MVT::i32), // vindex
11580 VOffset, // voffset
11581 SOffset, // soffset
11582 Offset, // offset
11583 Op.getOperand(6), // cachepolicy, swizzled buffer
11584 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11585 };
11586 unsigned Opc =
11587 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11588 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11589 MemSDNode *M = cast<MemSDNode>(Op);
11590
11591 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11592 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11593 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11594
11595 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11596 M->getMemoryVT(), M->getMemOperand());
11597 }
11598
11599 case Intrinsic::amdgcn_struct_buffer_store:
11600 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11601 case Intrinsic::amdgcn_struct_buffer_store_format:
11602 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11603 const bool IsFormat =
11604 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11605 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11606
11607 SDValue VData = Op.getOperand(2);
11608 EVT VDataVT = VData.getValueType();
11609 EVT EltType = VDataVT.getScalarType();
11610 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11611
11612 if (IsD16) {
11613 VData = handleD16VData(VData, DAG);
11614 VDataVT = VData.getValueType();
11615 }
11616
11617 if (!isTypeLegal(VDataVT)) {
11618 VData =
11619 DAG.getNode(ISD::BITCAST, DL,
11620 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11621 }
11622
11623 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11624 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11625 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11626 SDValue Ops[] = {
11627 Chain,
11628 VData,
11629 Rsrc,
11630 Op.getOperand(4), // vindex
11631 VOffset, // voffset
11632 SOffset, // soffset
11633 Offset, // offset
11634 Op.getOperand(7), // cachepolicy, swizzled buffer
11635 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11636 };
11637 unsigned Opc =
11638 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11639 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11640 MemSDNode *M = cast<MemSDNode>(Op);
11641
11642 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11643 EVT VDataType = VData.getValueType().getScalarType();
11644 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11645 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11646
11647 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11648 M->getMemoryVT(), M->getMemOperand());
11649 }
11650 case Intrinsic::amdgcn_raw_buffer_load_lds:
11651 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11652 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11653 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11654 case Intrinsic::amdgcn_struct_buffer_load_lds:
11655 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11656 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11657 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11658 if (!Subtarget->hasVMemToLDSLoad())
11659 return SDValue();
11660 unsigned Opc;
11661 bool HasVIndex =
11662 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11663 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11664 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11665 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11666 unsigned OpOffset = HasVIndex ? 1 : 0;
11667 SDValue VOffset = Op.getOperand(5 + OpOffset);
11668 bool HasVOffset = !isNullConstant(VOffset);
11669 unsigned Size = Op->getConstantOperandVal(4);
11670
11671 switch (Size) {
11672 default:
11673 return SDValue();
11674 case 1:
11675 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11676 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11677 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11678 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11679 break;
11680 case 2:
11681 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11682 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11683 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11684 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11685 break;
11686 case 4:
11687 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11688 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11689 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11690 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11691 break;
11692 case 12:
11693 if (!Subtarget->hasLDSLoadB96_B128())
11694 return SDValue();
11695 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11696 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11697 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11698 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11699 break;
11700 case 16:
11701 if (!Subtarget->hasLDSLoadB96_B128())
11702 return SDValue();
11703 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11704 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11705 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11706 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11707 break;
11708 }
11709
11710 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11711
11713
11714 if (HasVIndex && HasVOffset)
11715 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11716 {Op.getOperand(5), // VIndex
11717 VOffset}));
11718 else if (HasVIndex)
11719 Ops.push_back(Op.getOperand(5));
11720 else if (HasVOffset)
11721 Ops.push_back(VOffset);
11722
11723 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11724 Ops.push_back(Rsrc);
11725 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11726 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11727 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11728 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11729 Ops.push_back(DAG.getTargetConstant(
11730 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11731 DL, MVT::i8)); // cpol
11732 Ops.push_back(DAG.getTargetConstant(
11733 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11734 ? 1
11735 : 0,
11736 DL, MVT::i8)); // swz
11737 Ops.push_back(
11738 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
11739 Ops.push_back(M0Val.getValue(0)); // Chain
11740 Ops.push_back(M0Val.getValue(1)); // Glue
11741
11742 auto *M = cast<MemSDNode>(Op);
11743 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11744 DAG.setNodeMemRefs(Load, M->memoperands());
11745
11746 return SDValue(Load, 0);
11747 }
11748 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11749 // for "trust me" that the remaining cases are global pointers until
11750 // such time as we can put two mem operands on an intrinsic.
11751 case Intrinsic::amdgcn_load_to_lds:
11752 case Intrinsic::amdgcn_load_async_to_lds:
11753 case Intrinsic::amdgcn_global_load_lds:
11754 case Intrinsic::amdgcn_global_load_async_lds: {
11755 if (!Subtarget->hasVMemToLDSLoad())
11756 return SDValue();
11757
11758 unsigned Opc;
11759 unsigned Size = Op->getConstantOperandVal(4);
11760 switch (Size) {
11761 default:
11762 return SDValue();
11763 case 1:
11764 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11765 break;
11766 case 2:
11767 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11768 break;
11769 case 4:
11770 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11771 break;
11772 case 12:
11773 if (!Subtarget->hasLDSLoadB96_B128())
11774 return SDValue();
11775 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11776 break;
11777 case 16:
11778 if (!Subtarget->hasLDSLoadB96_B128())
11779 return SDValue();
11780 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11781 break;
11782 }
11783
11784 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11785
11787
11788 SDValue Addr = Op.getOperand(2); // Global ptr
11789 SDValue VOffset;
11790 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11791 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11792 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11793 SDValue LHS = Addr.getOperand(0);
11794 SDValue RHS = Addr.getOperand(1);
11795
11796 if (LHS->isDivergent())
11797 std::swap(LHS, RHS);
11798
11799 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11800 RHS.getOperand(0).getValueType() == MVT::i32) {
11801 // add (i64 sgpr), (zero_extend (i32 vgpr))
11802 Addr = LHS;
11803 VOffset = RHS.getOperand(0);
11804 }
11805 }
11806
11807 Ops.push_back(Addr);
11808 if (!Addr->isDivergent()) {
11810 if (!VOffset)
11811 VOffset =
11812 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11813 DAG.getTargetConstant(0, DL, MVT::i32)),
11814 0);
11815 Ops.push_back(VOffset);
11816 }
11817
11818 Ops.push_back(Op.getOperand(5)); // Offset
11819
11820 unsigned Aux = Op.getConstantOperandVal(6);
11821 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11822 MVT::i32)); // CPol
11823 Ops.push_back(
11824 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
11825
11826 Ops.push_back(M0Val.getValue(0)); // Chain
11827 Ops.push_back(M0Val.getValue(1)); // Glue
11828
11829 auto *M = cast<MemSDNode>(Op);
11830 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11831 DAG.setNodeMemRefs(Load, M->memoperands());
11832
11833 return SDValue(Load, 0);
11834 }
11835 case Intrinsic::amdgcn_end_cf:
11836 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11837 Op->getOperand(2), Chain),
11838 0);
11839 case Intrinsic::amdgcn_s_barrier_init:
11840 case Intrinsic::amdgcn_s_barrier_signal_var: {
11841 // these two intrinsics have two operands: barrier pointer and member count
11842 SDValue Chain = Op->getOperand(0);
11844 SDValue BarOp = Op->getOperand(2);
11845 SDValue CntOp = Op->getOperand(3);
11846 SDValue M0Val;
11847 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11848 ? AMDGPU::S_BARRIER_INIT_M0
11849 : AMDGPU::S_BARRIER_SIGNAL_M0;
11850 // extract the BarrierID from bits 4-9 of BarOp
11851 SDValue BarID;
11852 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11853 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11854 BarID =
11855 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11856 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11857 0);
11858 // Member count should be put into M0[ShAmt:+6]
11859 // Barrier ID should be put into M0[5:0]
11860 M0Val =
11861 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11862 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11863 0);
11864 constexpr unsigned ShAmt = 16;
11865 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11866 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11867
11868 M0Val = SDValue(
11869 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11870
11871 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11872
11873 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11874 return SDValue(NewMI, 0);
11875 }
11876 case Intrinsic::amdgcn_s_wakeup_barrier: {
11877 if (!Subtarget->hasSWakeupBarrier())
11878 return SDValue();
11879 [[fallthrough]];
11880 }
11881 case Intrinsic::amdgcn_s_barrier_join: {
11882 // these three intrinsics have one operand: barrier pointer
11883 SDValue Chain = Op->getOperand(0);
11885 SDValue BarOp = Op->getOperand(2);
11886 unsigned Opc;
11887
11888 if (isa<ConstantSDNode>(BarOp)) {
11889 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11890 switch (IntrinsicID) {
11891 default:
11892 return SDValue();
11893 case Intrinsic::amdgcn_s_barrier_join:
11894 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11895 break;
11896 case Intrinsic::amdgcn_s_wakeup_barrier:
11897 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11898 break;
11899 }
11900 // extract the BarrierID from bits 4-9 of the immediate
11901 unsigned BarID = (BarVal >> 4) & 0x3F;
11902 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11903 Ops.push_back(K);
11904 Ops.push_back(Chain);
11905 } else {
11906 switch (IntrinsicID) {
11907 default:
11908 return SDValue();
11909 case Intrinsic::amdgcn_s_barrier_join:
11910 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11911 break;
11912 case Intrinsic::amdgcn_s_wakeup_barrier:
11913 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11914 break;
11915 }
11916 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11917 SDValue M0Val;
11918 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11919 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11920 M0Val =
11921 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11922 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11923 0);
11924 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11925 }
11926
11927 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11928 return SDValue(NewMI, 0);
11929 }
11930 case Intrinsic::amdgcn_s_prefetch_data: {
11931 // For non-global address space preserve the chain and remove the call.
11933 return Op.getOperand(0);
11934 return Op;
11935 }
11936 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11937 SDValue Ops[] = {
11938 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11939 Op.getOperand(3), // offset
11940 Op.getOperand(4), // length
11941 };
11942
11943 MemSDNode *M = cast<MemSDNode>(Op);
11944 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11945 Op->getVTList(), Ops, M->getMemoryVT(),
11946 M->getMemOperand());
11947 }
11948 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11949 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11950 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11951 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11952 SDValue Chain = Op->getOperand(0);
11953 SDValue Ptr = Op->getOperand(2);
11954 SDValue Val = Op->getOperand(3);
11955 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11956 Ptr, MII->getMemOperand());
11957 }
11958 default: {
11959 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11961 return lowerImage(Op, ImageDimIntr, DAG, true);
11962
11963 return Op;
11964 }
11965 }
11966}
11967
11968// Return whether the operation has NoUnsignedWrap property.
11969static bool isNoUnsignedWrap(SDValue Addr) {
11970 return (Addr.getOpcode() == ISD::ADD &&
11971 Addr->getFlags().hasNoUnsignedWrap()) ||
11972 Addr->getOpcode() == ISD::OR;
11973}
11974
// Target hook taking a pointer value type; reports true only for 64-bit
// pointers.
// NOTE(review): the declaration line (function name and any leading
// parameters) was elided by the source extraction -- confirm the member
// name against SIISelLowering.h before editing.
                                             EVT PtrVT) const {
  return PtrVT == MVT::i64;
}
11979
// Target hook taking a pointer value type; unconditionally enabled for all
// pointer types on this target.
// NOTE(review): the declaration line (function name and any leading
// parameters) was elided by the source extraction -- confirm the member
// name against SIISelLowering.h before editing.
                                             EVT PtrVT) const {
  return true;
}
11984
11985// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11986// offset (the offset that is included in bounds checking and swizzling, to be
11987// split between the instruction's voffset and immoffset fields) and soffset
11988// (the offset that is excluded from bounds checking and swizzling, to go in
11989// the instruction's soffset field). This function takes the first kind of
11990// offset and figures out how to split it between voffset and immoffset.
11991std::pair<SDValue, SDValue>
11992SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11993 SDLoc DL(Offset);
11994 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11995 SDValue N0 = Offset;
11996 ConstantSDNode *C1 = nullptr;
11997
11998 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11999 N0 = SDValue();
12000 else if (DAG.isBaseWithConstantOffset(N0)) {
12001 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12002 // being added, so we can only safely match a 32-bit addition with no
12003 // unsigned overflow.
12004 bool CheckNUW = Subtarget->hasGFX1250Insts();
12005 if (!CheckNUW || isNoUnsignedWrap(N0)) {
12006 C1 = cast<ConstantSDNode>(N0.getOperand(1));
12007 N0 = N0.getOperand(0);
12008 }
12009 }
12010
12011 if (C1) {
12012 unsigned ImmOffset = C1->getZExtValue();
12013 // If the immediate value is too big for the immoffset field, put only bits
12014 // that would normally fit in the immoffset field. The remaining value that
12015 // is copied/added for the voffset field is a large power of 2, and it
12016 // stands more chance of being CSEd with the copy/add for another similar
12017 // load/store.
12018 // However, do not do that rounding down if that is a negative
12019 // number, as it appears to be illegal to have a negative offset in the
12020 // vgpr, even if adding the immediate offset makes it positive.
12021 unsigned Overflow = ImmOffset & ~MaxImm;
12022 ImmOffset -= Overflow;
12023 if ((int32_t)Overflow < 0) {
12024 Overflow += ImmOffset;
12025 ImmOffset = 0;
12026 }
12027 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
12028 if (Overflow) {
12029 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
12030 if (!N0)
12031 N0 = OverflowVal;
12032 else {
12033 SDValue Ops[] = {N0, OverflowVal};
12034 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
12035 }
12036 }
12037 }
12038 if (!N0)
12039 N0 = DAG.getConstant(0, DL, MVT::i32);
12040 if (!C1)
12041 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
12042 return {N0, SDValue(C1, 0)};
12043}
12044
12045// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12046// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12047// pointed to by Offsets.
12048void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12049 SelectionDAG &DAG, SDValue *Offsets,
12050 Align Alignment) const {
12051 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12052 SDLoc DL(CombinedOffset);
12053 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
12054 uint32_t Imm = C->getZExtValue();
12055 uint32_t SOffset, ImmOffset;
12056 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12057 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
12058 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12059 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12060 return;
12061 }
12062 }
12063 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
12064 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12065 // being added, so we can only safely match a 32-bit addition with no
12066 // unsigned overflow.
12067 bool CheckNUW = Subtarget->hasGFX1250Insts();
12068 SDValue N0 = CombinedOffset.getOperand(0);
12069 SDValue N1 = CombinedOffset.getOperand(1);
12070 uint32_t SOffset, ImmOffset;
12071 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
12072 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
12073 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12074 Offsets[0] = N0;
12075 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12076 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12077 return;
12078 }
12079 }
12080
12081 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12082 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
12083 : DAG.getConstant(0, DL, MVT::i32);
12084
12085 Offsets[0] = CombinedOffset;
12086 Offsets[1] = SOffsetZero;
12087 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
12088}
12089
12090SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12091 SelectionDAG &DAG) const {
12092 if (!MaybePointer.getValueType().isScalarInteger())
12093 return MaybePointer;
12094
12095 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12096 return Rsrc;
12097}
12098
12099// Wrap a global or flat pointer into a buffer intrinsic using the flags
12100// specified in the intrinsic.
12101SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12102 SelectionDAG &DAG) const {
12103 SDLoc Loc(Op);
12104
12105 SDValue Pointer = Op->getOperand(1);
12106 SDValue Stride = Op->getOperand(2);
12107 SDValue NumRecords = Op->getOperand(3);
12108 SDValue Flags = Op->getOperand(4);
12109
12110 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12111 SDValue Rsrc;
12112
12113 if (Subtarget->has45BitNumRecordsBufferResource()) {
12114 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12115 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12116 // num_records.
12117 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12118 SDValue NumRecordsLHS =
12119 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12120 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12121 SDValue LowHalf =
12122 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12123
12124 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12125 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12126 SDValue NumRecordsRHS =
12127 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12128 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12129 SDValue ShiftedStride =
12130 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12131 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12132 SDValue ExtShiftedStrideVec =
12133 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12134 SDValue ExtShiftedStride =
12135 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12136 SDValue ShiftedFlags =
12137 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12138 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12139 SDValue ExtShiftedFlagsVec =
12140 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12141 SDValue ExtShiftedFlags =
12142 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12143 SDValue CombinedFields =
12144 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12145 SDValue HighHalf =
12146 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12147
12148 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12149 } else {
12150 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12151 auto [LowHalf, HighHalf] =
12152 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12153 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12154 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12155 SDValue ShiftedStride =
12156 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12157 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12158 SDValue NewHighHalf =
12159 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12160
12161 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12162 NumRecords, Flags);
12163 }
12164
12165 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12166 return RsrcPtr;
12167}
12168
// Handle 8 bit and 16 bit buffer loads
// Lowers a sub-dword buffer load to the unsigned byte/short buffer-load node
// (with a TFE status result when IsTFE is set), then truncates/bitcasts the
// 32-bit result back to the requested load type.
// NOTE(review): one parameter line of the signature (the operand list,
// apparently ArrayRef<SDValue> Ops, judging by the uses below) was elided by
// the source extraction -- confirm against the upstream file.
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    // TFE returns data + status, so the memory operand covers 8 bytes.
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
    // Element 1 is the TFE status word, element 0 the loaded data.
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  // Narrow the 32-bit result to the memory type, then restore the FP type
  // (if any) via bitcast.
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
12206
12207// Handle 8 bit and 16 bit buffer stores
12208SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12209 EVT VDataType, SDLoc DL,
12210 SDValue Ops[],
12211 MemSDNode *M) const {
12212 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12213 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12214
12215 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12216 Ops[1] = BufferStoreExt;
12217 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12218 : AMDGPUISD::BUFFER_STORE_SHORT;
12219 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12220 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12221 M->getMemOperand());
12222}
12223
// Extend or truncate Op to VT according to the given load-extension kind:
// truncate when VT is narrower, otherwise sext/zext/anyext (or pass through
// for a non-extending load).
// NOTE(review): the first line of this helper's signature (its name and
// leading parameters, which the body shows include a SelectionDAG &DAG and
// an ISD::LoadExtType ExtType) was elided by the source extraction --
// confirm against the upstream file.
                                  SDValue Op, const SDLoc &SL, EVT VT) {
  // A narrower destination only ever needs a truncate, whatever ExtType is.
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
12242
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
//
// NOTE(review): a few continuation lines in this function (part of the
// address-space check, an assertion message, and part of the else-if chain
// near the end) were elided by the source extraction; compare against the
// upstream file before editing.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Requires dword alignment and a non-divergent address.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  // Replace the narrow load with a plain i32 load at the same address.
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  // Extension bookkeeping for FP memory types is done on the equivalent
  // integer type.
  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Re-create the original extension semantics on top of the widened i32
  // value (sign/zero-extend the low MemVT bits in place).
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  // Return the converted value along with the new load's chain.
  return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
}
12312
// Return whether a memory access might touch private (scratch) memory and
// therefore needs private-style legalization (see the caller in LowerLOAD).
// For entry functions this is tied to whether flat-scratch is initialized;
// for everything else it is conservatively assumed possible.
// NOTE(review): the first signature line (the function name, known to be
// addressMayBeAccessedAsPrivate from its call site, and its first parameter)
// was elided by the source extraction -- confirm against the upstream file.
                                         const SIMachineFunctionInfo &Info) {
  // TODO: Should check if the address can definitely not access stack.
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  return true;
}
12320
// Custom lowering for loads: handles sub-dword scalar loads by widening, and
// decides per address space and vector width whether a vector load is legal
// as-is, must be split, or must be widened.
//
// NOTE(review): several continuation lines in this function (parts of two
// address-space conditions, a ternary arm after addressMayBeAccessedAsPrivate,
// a local declaration feeding Elts, and two call-argument lines near the end)
// were elided by the source extraction; compare against the upstream file
// before editing.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    // Legal 16-bit loads need no custom handling.
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    // Unpack an i1 vector from the widened word: shift each bit down and
    // truncate to i1.
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Split misaligned wide flat loads on subtargets with the WGP-mode LDS
  // misalignment bug.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
    // Uniform, dword-aligned loads can stay as-is (selected to scalar
    // instructions) when the shape is supported; otherwise widen/split.
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the load intact only when the (elided) misaligned-access
    // query reports it fast; otherwise split.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
                                          Load->getAlign(), Flags, &Fast) &&
      Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  // Fallback: expand accesses the target reports as unsupported for this
  // alignment (condition line elided by extraction).
                                      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
12463
12464SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12465 EVT VT = Op.getValueType();
12466 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12467 VT.getSizeInBits() == 512)
12468 return splitTernaryVectorOp(Op, DAG);
12469
12470 assert(VT.getSizeInBits() == 64);
12471
12472 SDLoc DL(Op);
12473 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12474
12475 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12476 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12477
12478 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12479 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12480
12481 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12482 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12483
12484 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12485
12486 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12487 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12488
12489 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12490
12491 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12492 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12493}
12494
12495// Catch division cases where we can use shortcuts with rcp and rsq
12496// instructions.
12497SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12498 SelectionDAG &DAG) const {
12499 SDLoc SL(Op);
12500 SDValue LHS = Op.getOperand(0);
12501 SDValue RHS = Op.getOperand(1);
12502 EVT VT = Op.getValueType();
12503 const SDNodeFlags Flags = Op->getFlags();
12504
12505 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12506
12507 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12508 // Without !fpmath accuracy information, we can't do more because we don't
12509 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12510 // f16 is always accurate enough
12511 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12512 return SDValue();
12513
12514 if (CLHS->isExactlyValue(1.0)) {
12515 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12516 // the CI documentation has a worst case error of 1 ulp.
12517 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12518 // use it as long as we aren't trying to use denormals.
12519 //
12520 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12521
12522 // 1.0 / sqrt(x) -> rsq(x)
12523
12524 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12525 // error seems really high at 2^29 ULP.
12526 // 1.0 / x -> rcp(x)
12527 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12528 }
12529
12530 // Same as for 1.0, but expand the sign out of the constant.
12531 if (CLHS->isExactlyValue(-1.0)) {
12532 // -1.0 / x -> rcp (fneg x)
12533 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12534 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12535 }
12536 }
12537
12538 // For f16 and bf16 require afn or arcp.
12539 // For f32 require afn.
12540 if (!AllowInaccurateRcp &&
12541 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12542 return SDValue();
12543
12544 // Turn into multiply by the reciprocal.
12545 // x / y -> x * (1.0 / y)
12546 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12547 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12548}
12549
12550SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12551 SelectionDAG &DAG) const {
12552 SDLoc SL(Op);
12553 SDValue X = Op.getOperand(0);
12554 SDValue Y = Op.getOperand(1);
12555 EVT VT = Op.getValueType();
12556 const SDNodeFlags Flags = Op->getFlags();
12557
12558 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12559 if (!AllowInaccurateDiv)
12560 return SDValue();
12561
12562 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12563 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12564
12565 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12566 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12567
12568 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12569 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12570 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12571 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12572 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12573 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12574}
12575
12576static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12577 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12578 SDNodeFlags Flags) {
12579 if (GlueChain->getNumValues() <= 1) {
12580 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12581 }
12582
12583 assert(GlueChain->getNumValues() == 3);
12584
12585 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12586 switch (Opcode) {
12587 default:
12588 llvm_unreachable("no chain equivalent for opcode");
12589 case ISD::FMUL:
12590 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12591 break;
12592 }
12593
12594 return DAG.getNode(Opcode, SL, VTList,
12595 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12596 Flags);
12597}
12598
12599static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12600 EVT VT, SDValue A, SDValue B, SDValue C,
12601 SDValue GlueChain, SDNodeFlags Flags) {
12602 if (GlueChain->getNumValues() <= 1) {
12603 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12604 }
12605
12606 assert(GlueChain->getNumValues() == 3);
12607
12608 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12609 switch (Opcode) {
12610 default:
12611 llvm_unreachable("no chain equivalent for opcode");
12612 case ISD::FMA:
12613 Opcode = AMDGPUISD::FMA_W_CHAIN;
12614 break;
12615 }
12616
12617 return DAG.getNode(Opcode, SL, VTList,
12618 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12619 Flags);
12620}
12621
// Lower a 16-bit FDIV (f16 or bf16) by computing in f32 and rounding back.
// bf16 just divides in f32; f16 uses a refined rcp-based sequence finished
// by DIV_FIXUP.
// NOTE(review): the initializer of FMADOpCode below lost its continuation
// line to the source extraction (per the comment above it, it selects
// ISD::FMAD where supported and ISD::FMA otherwise) -- confirm against the
// upstream file.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  // Prefer the rcp shortcut when fast-math flags allow it.
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);

  if (VT == MVT::bf16) {
    SDValue ExtDiv =
        DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
                       DAG.getTargetConstant(0, SL, MVT::i32));
  }

  assert(VT == MVT::f16);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Rcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
  SDValue Quot =
      DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                            Op->getFlags());
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                    Op->getFlags());
  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
  // Keep only sign+exponent bits of the correction term (mask 0xff800000),
  // mirroring the V_AND_B32 step in the sequence above.
  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
                        DAG.getConstant(0xff800000, SL, MVT::i32));
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
                             DAG.getTargetConstant(0, SL, MVT::i32));
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
                     Op->getFlags());
}
12680
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  // Operands 1 and 2: operand 0 is presumably the intrinsic ID of the
  // fdiv.fast intrinsic being lowered -- confirm against the caller.
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  // TODO: The combiner should probably handle elimination of redundant fabs.
                   ? RHS
                   : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  // K0 = 2^96: threshold above which the denominator magnitude risks
  // overflowing the reciprocal. K1 = 2^-32: pre-scale applied in that case.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  // r2 = (|denominator| > 2^96)
  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  // r3 = scale factor: 2^-32 for a large denominator, else 1.0 (no-op).
  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  // Pre-scale the denominator so rcp stays in range.
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  // Multiply by the same scale factor again to undo the denominator scaling.
  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
12717
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
// The requested single-precision mode occupies bits [1:0]; the function's
// current double/half-precision denormal mode is preserved in bits [3:2].
                               const SIMachineFunctionInfo *Info,
                               const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
12728
// Full-precision f32 fdiv lowering: div_scale + Newton-Raphson refinement +
// div_fmas + div_fixup. When the function does not keep FP32 denormals in
// IEEE mode, denormal flushing is temporarily disabled around the refinement
// sequence (via S_DENORM_MODE or S_SETREG), restoring the previous mode
// afterwards.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  // DIV_SCALE produces the scaled value plus an i1 "was scaled" flag.
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
  SDValue NegDivScale0 =
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  // MODE register bit field [5:4] holding the FP32 denormal mode.
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      // Dynamic mode: read the current MODE register so it can be restored
      // exactly after the expansion.
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
    }

    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
                                 EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    // Thread the chain/glue of the mode switch through NegDivScale0 so the
    // FMA sequence below is ordered after the mode change.
    SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
                      SDValue(EnableDenorm, 1)};

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  // Newton-Raphson refinement of the reciprocal, then of the quotient.
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
                           Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // Restore the previous denormal mode, ordered after Fma4 via its
    // chain/glue results.
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);

      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      DisableDenorm =
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                      Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
              .getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    // Keep the restore alive by merging it into the DAG root.
    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  // DIV_FIXUP handles the special cases (inf/nan/zero operands).
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
12865
// Full-precision f64 fdiv lowering: div_scale + Newton-Raphson refinement of
// rcp and the quotient + div_fmas + div_fixup.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0); // numerator
  SDValue Y = Op.getOperand(1); // denominator

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  // DIV_SCALE produces the scaled value plus an i1 "was scaled" flag.
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  // Fma0 = 1 - y*rcp(y): first reciprocal error term.
  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  // Second refinement iteration.
  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  // Initial quotient estimate, then one correction step on it.
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 =
      DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out if the scale to use for div_fmas.
    // Recompute the "was scaled" predicate by comparing the high dwords of
    // the inputs against their div_scale results: a changed high dword means
    // scaling occurred.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas =
      DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);

  // DIV_FIXUP handles the special cases (inf/nan/zero operands).
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
12934
12935SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12936 EVT VT = Op.getValueType();
12937
12938 if (VT == MVT::f32)
12939 return LowerFDIV32(Op, DAG);
12940
12941 if (VT == MVT::f64)
12942 return LowerFDIV64(Op, DAG);
12943
12944 if (VT == MVT::f16 || VT == MVT::bf16)
12945 return LowerFDIV16(Op, DAG);
12946
12947 llvm_unreachable("Unexpected type for fdiv");
12948}
12949
// Lower FFREXP to the amdgcn.frexp.mant / amdgcn.frexp.exp intrinsics,
// converting the hardware exponent type to the type the node's users expect.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  EVT VT = Val.getValueType();
  EVT ResultExpVT = Op->getValueType(1);
  // The instruction's exponent result is i16 for f16 sources, i32 otherwise.
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    // On subtargets with the fract bug, explicitly produce the expected
    // results for non-finite inputs (|Val| >= inf): exponent 0 and the input
    // itself as the mantissa.
    SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
    SDValue Inf =

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
    Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
    Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
  }

  // Widen/narrow the instruction's exponent type to the node's result type.
  SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
  return DAG.getMergeValues({Mant, CastExp}, dl);
}
12979
// Custom STORE legalization: i1 stores become truncating i32 stores; vector
// stores are split or scalarized depending on the address space, element
// count, alignment, and subtarget limits.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Store an i1 as a sign-extended i32 truncated back to one bit in memory.
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    // Split misaligned multi-dword flat stores to work around the LDS
    // misaligned-access bug in WGP mode.
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)

  unsigned NumElements = VT.getVectorNumElements();
    // Global/flat path: at most 4 dwords per store instruction.
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

                                        VT, *Store->getMemOperand()))
      return expandUnalignedStore(Store, DAG);

    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private (scratch) stores are limited by the configured max element
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
                                   Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
13060
13061// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13062SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13063 SDLoc SL(Op);
13064 assert(!Subtarget->has16BitInsts());
13065 SDNodeFlags Flags = Op->getFlags();
13066 SDValue Ext =
13067 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13068
13069 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13070 SDValue Sqrt =
13071 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13072
13073 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13074 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13075}
13076
// Correctly-rounded f32 sqrt expansion. Small inputs are pre-scaled by 2^32
// (and the result scaled back by 2^-16, since sqrt halves the exponent) to
// avoid intermediate denormals.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    // Denormal-preserving path: take the hardware sqrt result s, then check
    // its one-ulp neighbors. The FMA residuals x - s_down*s and x - s_up*s
    // decide whether a neighbor is the better-rounded result.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    // One ulp down: subtract 1 from the bit pattern.
    SDValue SqrtSNextDownInt =
        DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                    DAG.getAllOnesConstant(DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    // One ulp up: add 1 to the bit pattern.
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    // Flush-to-zero path: rsq followed by refinement steps
    // (s = x*rsq(x), h = rsq(x)/2, then FMA-based corrections).
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  // Undo the input scaling: sqrt(2^32 * x) = 2^16 * sqrt(x).
  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  // For +/-0 and +inf inputs, pass the (scaled) input straight through.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
13166
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  // y0 = rsq(x)
  // g0 = x * y0
  // h0 = 0.5 * y0
  //
  // r0 = 0.5 - h0 * g0
  // g1 = g0 * r0 + g0
  // h1 = h0 * r0 + h0
  //
  // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  // g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  // h2 = h1 * r1 + h1
  //
  // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  // g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  // sqrt(x) = g3

  SDNodeFlags Flags = Op->getFlags();

  SDLoc DL(Op);

  SDValue X = Op.getOperand(0);
  SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);

  // Inputs below 2^-767 are pre-scaled to avoid losing precision near the
  // denormal range.
  SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);

  SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);

  // Scale up input if it is too small.
  // ldexp by 256; the result is scaled back by -128 since sqrt halves the
  // exponent.
  SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
  SDValue ScaleUp =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
  SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);

  SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);  // y0

  SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);  // g0

  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
  SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);  // h0

  SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
  SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);

  SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);

  SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);

  SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
  SDValue SqrtD0 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);  // d0

  SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);

  SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
  SDValue SqrtD1 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);  // d1

  SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);

  SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
  SDValue ScaleDown =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  // If x is +INF, +0, or -0, use its original value
  return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
                     Flags);
}
13248
13249SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13250 SDLoc DL(Op);
13251 EVT VT = Op.getValueType();
13252 SDValue Arg = Op.getOperand(0);
13253 SDValue TrigVal;
13254
13255 // Propagate fast-math flags so that the multiply we introduce can be folded
13256 // if Arg is already the result of a multiply by constant.
13257 auto Flags = Op->getFlags();
13258
13259 // AMDGPUISD nodes of vector type must be unrolled here since
13260 // they will not be expanded elsewhere.
13261 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13262 if (!V.getValueType().isVector())
13263 return V;
13264
13265 return DAG.UnrollVectorOp(cast<SDNode>(V));
13266 };
13267
13268 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13269
13270 if (Subtarget->hasTrigReducedRange()) {
13271 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13272 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13273 } else {
13274 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13275 }
13276
13277 switch (Op.getOpcode()) {
13278 case ISD::FCOS:
13279 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13280 break;
13281 case ISD::FSIN:
13282 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13283 break;
13284 default:
13285 llvm_unreachable("Wrong trig opcode");
13286 }
13287
13288 return UnrollIfVec(TrigVal);
13289}
13290
// Custom-lower a non-local atomic compare-and-swap to the target
// ATOMIC_CMP_SWAP node, which takes {new, old} packed into one vector
// operand.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  // Pack {new, old} into the single vector operand the target node expects.
  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
13319
13320//===----------------------------------------------------------------------===//
13321// Custom DAG optimizations
13322//===----------------------------------------------------------------------===//
13323
13324SDValue
13325SITargetLowering::performUCharToFloatCombine(SDNode *N,
13326 DAGCombinerInfo &DCI) const {
13327 EVT VT = N->getValueType(0);
13328 EVT ScalarVT = VT.getScalarType();
13329 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13330 return SDValue();
13331
13332 SelectionDAG &DAG = DCI.DAG;
13333 SDLoc DL(N);
13334
13335 SDValue Src = N->getOperand(0);
13336 EVT SrcVT = Src.getValueType();
13337
13338 // TODO: We could try to match extracting the higher bytes, which would be
13339 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13340 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13341 // about in practice.
13342 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13343 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13344 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13345 DCI.AddToWorklist(Cvt.getNode());
13346
13347 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13348 if (ScalarVT != MVT::f32) {
13349 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13350 DAG.getTargetConstant(0, DL, MVT::i32));
13351 }
13352 return Cvt;
13353 }
13354 }
13355
13356 return SDValue();
13357}
13358
// FCOPYSIGN combines: split f64 magnitudes into a copied low half plus an f32
// copysign of the high half, and narrow f64 sign operands to their high
// 32 bits (which hold the sign bit).
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);

    // Rebuild each f64 element from its untouched low half and an f32
    // copysign of its high half.
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I, DL, MVT::i32));
      SDValue MagHi =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I + 1, DL, MVT::i32));

      SDValue SignOpElt =
          MagVT.isVector()
                            SignOp, DAG.getConstant(I, DL, MVT::i32))
              : SignOp;

      SDValue HiOp =
          DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);

      SDValue Vector =
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

      SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
      NewElts.push_back(NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
                    DAG.getConstant(2 * I + 1, DL, MVT::i32));
    F32Signs.push_back(SignAsF32);
  }

  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
                EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
                F32Signs);

  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
                     NewSign);
}
13454
13455// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13456// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13457// bits
13458
13459// This is a variant of
13460// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13461//
13462// The normal DAG combiner will do this, but only if the add has one use since
13463// that would increase the number of instructions.
13464//
13465// This prevents us from seeing a constant offset that can be folded into a
13466// memory instruction's addressing mode. If we know the resulting add offset of
13467// a pointer can be folded into an addressing offset, we can replace the pointer
13468// operand with the add of new constant offset. This eliminates one of the uses,
13469// and may allow the remaining use to also be simplified.
13470//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0); // the add/or feeding the shift
  SDValue N1 = N->getOperand(1); // the shift amount

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
    return SDValue();

  // Both the shift amount and the inner offset must be constants.
  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // An OR is only equivalent to an ADD when the operands have no set bits in
  // common.
  if (N0->getOpcode() == ISD::OR &&
      !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the
  // addressing mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    return SDValue();

  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, SL, VT);

  // nuw transfers only when both the shift and the inner op (or an OR, which
  // cannot wrap here) are known not to wrap.
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
      (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));

  // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
  // be sure that the new left operand is a proper base pointer.
  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
13522
13523/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
13524/// by the chain and intrinsic ID. Theoretically we would also need to check the
13525/// specific intrinsic, but they all place the pointer operand first.
13526static unsigned getBasePtrIndex(const MemSDNode *N) {
13527 switch (N->getOpcode()) {
13528 case ISD::STORE:
 // Stores carry (chain, value, pointer, ...), so the base pointer is
 // operand 2. NOTE(review): additional store-like case labels were elided
 // from this view (numbering gap 13529-13530) — confirm against upstream.
13531 return 2;
13532 default:
 // Everything else places the pointer right after the chain.
13533 return 1;
13534 }
13535}
13536
13537SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13538 DAGCombinerInfo &DCI) const {
13539 SelectionDAG &DAG = DCI.DAG;
13540
13541 unsigned PtrIdx = getBasePtrIndex(N);
13542 SDValue Ptr = N->getOperand(PtrIdx);
13543
13544 // TODO: We could also do this for multiplies.
13545 if (Ptr.getOpcode() == ISD::SHL) {
13546 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13547 N->getMemoryVT(), DCI);
13548 if (NewPtr) {
13549 SmallVector<SDValue, 8> NewOps(N->ops());
13550
13551 NewOps[PtrIdx] = NewPtr;
13552 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13553 }
13554 }
13555
13556 return SDValue();
13557}
13558
13559static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13560 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13561 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13562 (Opc == ISD::XOR && Val == 0);
13563}
13564
13565// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13566// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13567// integer combine opportunities since most 64-bit operations are decomposed
13568// this way. TODO: We won't want this for SALU especially if it is an inline
13569// immediate.
13570SDValue SITargetLowering::splitBinaryBitConstantOp(
13571 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13572 const ConstantSDNode *CRHS) const {
 // Split a 64-bit and/or/xor with a constant into two 32-bit ops when either
 // half folds away, or when the constant would otherwise need a non-inline
 // 64-bit literal.
13573 uint64_t Val = CRHS->getZExtValue();
13574 uint32_t ValLo = Lo_32(Val);
13575 uint32_t ValHi = Hi_32(Val);
13576 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13577
 // NOTE(review): one condition line was elided from this view (numbering gap
 // at 13579); it presumably tests the high half — confirm against upstream.
13578 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13580 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13581 // We have 64-bit scalar and/or/xor, but do not have vector forms.
 // A target with 64-bit literals and a uniform (scalar) single user can
 // consume the constant directly, so keep the 64-bit op intact.
13582 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13583 !CRHS->user_begin()->isDivergent())
13584 return SDValue();
13585
13586 // If we need to materialize a 64-bit immediate, it will be split up later
13587 // anyway. Avoid creating the harder to understand 64-bit immediate
13588 // materialization.
13589 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13590 }
13591
13592 return SDValue();
13593}
13594
13596 if (V.getValueType() != MVT::i1)
13597 return false;
 // NOTE(review): the function signature line (13595, presumably
 // bool llvm::isBoolSGPR(SDValue V)) was elided from this view.
 // Conservatively recognizes i1 values produced by compare-like nodes, so
 // they can be used directly as select/condition operands.
13598 switch (V.getOpcode()) {
13599 default:
13600 break;
 // Compare-style nodes always yield a canonical boolean.
13601 case ISD::SETCC:
13602 case ISD::IS_FPCLASS:
13603 case AMDGPUISD::FP_CLASS:
13604 return true;
 // Logical combinations of booleans are boolean.
13605 case ISD::AND:
13606 case ISD::OR:
13607 case ISD::XOR:
13608 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
 // For the overflow-producing arithmetic nodes, result #1 is the i1
 // overflow flag.
13609 case ISD::SADDO:
13610 case ISD::UADDO:
13611 case ISD::SSUBO:
13612 case ISD::USUBO:
13613 case ISD::SMULO:
13614 case ISD::UMULO:
13615 return V.getResNo() == 1;
 // NOTE(review): a case label was elided here (numbering gap at 13616,
 // likely ISD::INTRINSIC_WO_CHAIN) — confirm against upstream.
13617 unsigned IntrinsicID = V.getConstantOperandVal(0);
13618 switch (IntrinsicID) {
13619 case Intrinsic::amdgcn_is_shared:
13620 case Intrinsic::amdgcn_is_private:
13621 return true;
13622 default:
13623 return false;
13624 }
13625
13626 return false;
13627 }
13628 }
13629 return false;
13630}
13631
13632// If a constant has all zeroes or all ones within each byte return it.
13633// Otherwise return 0.
 // NOTE(review): the signature line (13634, presumably
 // static uint32_t getConstantPermuteMask(uint32_t C)) was elided from this
 // view.
13635 // 0xff for any zero byte in the mask
13636 uint32_t ZeroByteMask = 0;
13637 if (!(C & 0x000000ff))
13638 ZeroByteMask |= 0x000000ff;
13639 if (!(C & 0x0000ff00))
13640 ZeroByteMask |= 0x0000ff00;
13641 if (!(C & 0x00ff0000))
13642 ZeroByteMask |= 0x00ff0000;
13643 if (!(C & 0xff000000))
13644 ZeroByteMask |= 0xff000000;
13645 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
 // Every byte that is not fully zero must be fully 0xff; otherwise a byte is
 // only partially selected and the constant is not usable as a byte mask.
13646 if ((NonZeroByteMask & C) != NonZeroByteMask)
13647 return 0; // Partial bytes selected.
13648 return C;
13649}
13650
13651// Check if a node selects whole bytes from its operand 0 starting at a byte
13652// boundary while masking the rest. Returns select mask as in the v_perm_b32
13653// or -1 if not succeeded.
13654// Note byte select encoding:
13655// value 0-3 selects corresponding source byte;
13656// value 0xc selects zero;
13657// value 0xff selects 0xff.
 // NOTE(review): the signature line (13658, presumably
 // static uint32_t getPermuteMask(SDValue V)) was elided from this view.
13659 assert(V.getValueSizeInBits() == 32);
13660
13661 if (V.getNumOperands() != 2)
13662 return ~0;
13663
 // Operand 1 must be a constant mask/shift amount.
13664 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13665 if (!N1)
13666 return ~0;
13667
13668 uint32_t C = N1->getZExtValue();
13669
13670 switch (V.getOpcode()) {
13671 default:
13672 break;
 // AND keeps selected bytes (identity 0x03020100) and zeroes the rest (0x0c).
13673 case ISD::AND:
13674 if (uint32_t ConstMask = getConstantPermuteMask(C))
13675 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13676 break;
13677
 // OR forces selected bytes to 0xff and passes the others through.
13678 case ISD::OR:
13679 if (uint32_t ConstMask = getConstantPermuteMask(C))
13680 return (0x03020100 & ~ConstMask) | ConstMask;
13681 break;
13682
 // Whole-byte left shift: low bytes become zero (0x0c selectors shift in).
13683 case ISD::SHL:
13684 if (C % 8)
13685 return ~0;
13686
13687 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13688
 // Whole-byte right shift: high bytes become zero.
13689 case ISD::SRL:
13690 if (C % 8)
13691 return ~0;
13692
13693 return uint32_t(0x0c0c0c0c03020100ull >> C);
13694 }
13695
13696 return ~0;
13697}
13698
13699SDValue SITargetLowering::performAndCombine(SDNode *N,
13700 DAGCombinerInfo &DCI) const {
 // AND combines: split i64 constants, form BFE / v_perm / v_cmp_class /
 // select nodes from recognizable i32 AND patterns.
13701 if (DCI.isBeforeLegalize())
13702 return SDValue();
13703
13704 SelectionDAG &DAG = DCI.DAG;
13705 EVT VT = N->getValueType(0);
13706 SDValue LHS = N->getOperand(0);
13707 SDValue RHS = N->getOperand(1);
13708
 // 64-bit AND with constant: try splitting into two 32-bit ANDs.
13709 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13710 if (VT == MVT::i64 && CRHS) {
13711 if (SDValue Split =
13712 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13713 return Split;
13714 }
13715
13716 if (CRHS && VT == MVT::i32) {
13717 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13718 // nb = number of trailing zeroes in mask
13719 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13720 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
13721 uint64_t Mask = CRHS->getZExtValue();
13722 unsigned Bits = llvm::popcount(Mask);
13723 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13724 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13725 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13726 unsigned Shift = CShift->getZExtValue();
13727 unsigned NB = CRHS->getAPIntValue().countr_zero();
13728 unsigned Offset = NB + Shift;
13729 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13730 SDLoc SL(N);
13731 SDValue BFE =
13732 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13733 DAG.getConstant(Offset, SL, MVT::i32),
13734 DAG.getConstant(Bits, SL, MVT::i32));
13735 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
 // AssertZext records that the BFE result fits in Bits bits, so
 // later combines can drop redundant masking.
13736 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13737 DAG.getValueType(NarrowVT));
13738 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13739 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13740 return Shl;
13741 }
13742 }
13743 }
13744
13745 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13746 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13747 isa<ConstantSDNode>(LHS.getOperand(2))) {
13748 uint32_t Sel = getConstantPermuteMask(Mask);
13749 if (!Sel)
13750 return SDValue();
13751
13752 // Select 0xc for all zero bytes
13753 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13754 SDLoc DL(N);
13755 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13756 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13757 }
13758 }
13759
13760 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13761 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13762 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13763 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13764 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13765
 // Require the RHS compare to test fabs(x) against the same x.
13766 SDValue X = LHS.getOperand(0);
13767 SDValue Y = RHS.getOperand(0);
13768 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13769 !isTypeLegal(X.getValueType()))
13770 return SDValue();
13771
13772 if (LCC == ISD::SETO) {
13773 if (X != LHS.getOperand(1))
13774 return SDValue();
13775
13776 if (RCC == ISD::SETUNE) {
13777 const ConstantFPSDNode *C1 =
13778 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13779 if (!C1 || !C1->isInfinity() || C1->isNegative())
13780 return SDValue();
13781
 // NOTE(review): the remaining SIInstrFlags terms of this mask and the
 // static_assert operand lines were elided from this view (numbering
 // gaps 13783-13785 and 13788-13789) — confirm against upstream.
13782 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13786
13787 static_assert(
13790 0x3ff) == Mask,
13791 "mask not equal");
13792
13793 SDLoc DL(N);
13794 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13795 DAG.getConstant(Mask, DL, MVT::i32));
13796 }
13797 }
13798 }
13799
 // Canonicalize the fp_class operand onto RHS for the next pattern.
13800 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13801 std::swap(LHS, RHS);
13802
13803 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13804 RHS.hasOneUse()) {
13805 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13806 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13807 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13808 // | n_nan)
13809 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13810 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13811 (RHS.getOperand(0) == LHS.getOperand(0) &&
13812 LHS.getOperand(0) == LHS.getOperand(1))) {
13813 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13814 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13815 : Mask->getZExtValue() & OrdMask;
13816
13817 SDLoc DL(N);
13818 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13819 DAG.getConstant(NewMask, DL, MVT::i32));
13820 }
13821 }
13822
13823 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13824 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13825 // and x, (sext cc from i1) => select cc, x, 0
13826 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13827 std::swap(LHS, RHS);
13828 if (isBoolSGPR(RHS.getOperand(0)))
13829 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13830 DAG.getConstant(0, SDLoc(N), MVT::i32));
13831 }
13832
13833 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13834 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13835 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13836 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13837 uint32_t LHSMask = getPermuteMask(LHS);
13838 uint32_t RHSMask = getPermuteMask(RHS);
13839 if (LHSMask != ~0u && RHSMask != ~0u) {
13840 // Canonicalize the expression in an attempt to have fewer unique masks
13841 // and therefore fewer registers used to hold the masks.
13842 if (LHSMask > RHSMask) {
13843 std::swap(LHSMask, RHSMask);
13844 std::swap(LHS, RHS);
13845 }
13846
13847 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13848 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13849 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13850 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13851
13852 // Check of we need to combine values from two sources within a byte.
13853 if (!(LHSUsedLanes & RHSUsedLanes) &&
13854 // If we select high and lower word keep it for SDWA.
13855 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13856 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13857 // Each byte in each mask is either selector mask 0-3, or has higher
13858 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13859 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13860 // mask which is not 0xff wins. By anding both masks we have a correct
13861 // result except that 0x0c shall be corrected to give 0x0c only.
13862 uint32_t Mask = LHSMask & RHSMask;
13863 for (unsigned I = 0; I < 32; I += 8) {
13864 uint32_t ByteSel = 0xff << I;
13865 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13866 Mask &= (0x0c << I) & 0xffffffff;
13867 }
13868
13869 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13870 // or 0x0c.
13871 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13872 SDLoc DL(N);
13873
13874 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13875 RHS.getOperand(0),
13876 DAG.getConstant(Sel, DL, MVT::i32));
13877 }
13878 }
13879 }
13880
13881 return SDValue();
13882}
13883
13884// A key component of v_perm is a mapping between byte position of the src
13885// operands, and the byte position of the dest. To provide such, we need: 1. the
13886// node that provides x byte of the dest of the OR, and 2. the byte of the node
13887// used to provide that x byte. calculateByteProvider finds which node provides
13888// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13889// and finds an ultimate src and byte position For example: The supported
13890// LoadCombine pattern for vector loads is as follows
13891// t1
13892// or
13893// / \
13894// t2 t3
13895// zext shl
13896// | | \
13897// t4 t5 16
13898// or anyext
13899// / \ |
13900// t6 t7 t8
13901// srl shl or
13902// / | / \ / \
13903// t9 t10 t11 t12 t13 t14
13904// trunc* 8 trunc* 8 and and
13905// | | / | | \
13906// t15 t16 t17 t18 t19 t20
13907// trunc* 255 srl -256
13908// | / \
13909// t15 t15 16
13910//
13911// *In this example, the truncs are from i32->i16
13912//
13913// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13914// respectively. calculateSrcByte would find (given node) -> ultimate src &
13915// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13916// After finding the mapping, we can combine the tree into vperm t15, t16,
13917// 0x05000407
13918
13919// Find the source and byte position from a node.
13920// \p DestByte is the byte position of the dest of the or that the src
13921// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13922// dest of the or byte. \p Depth tracks how many recursive iterations we have
13923// performed.
13924static const std::optional<ByteProvider<SDValue>>
13925calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13926 unsigned Depth = 0) {
13927 // We may need to recursively traverse a series of SRLs
13928 if (Depth >= 6)
13929 return std::nullopt;
13930
 // Sub-byte values cannot supply a whole byte.
13931 if (Op.getValueSizeInBits() < 8)
13932 return std::nullopt;
13933
 // Vectors are taken as-is; byte selection happens via SrcIndex.
13934 if (Op.getValueType().isVector())
13935 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13936
13937 switch (Op->getOpcode()) {
 // Truncation does not move byte 0, so look through it.
13938 case ISD::TRUNCATE: {
13939 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13940 }
13941
13942 case ISD::ANY_EXTEND:
13943 case ISD::SIGN_EXTEND:
13944 case ISD::ZERO_EXTEND:
 // NOTE(review): a case label was elided here (numbering gap at 13945,
 // likely ISD::SIGN_EXTEND_INREG) — the VTSDNode handling below implies it.
13946 SDValue NarrowOp = Op->getOperand(0);
13947 auto NarrowVT = NarrowOp.getValueType();
13948 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13949 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13950 NarrowVT = VTSign->getVT();
13951 }
13952 if (!NarrowVT.isByteSized())
13953 return std::nullopt;
13954 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13955
 // The requested byte must come from the pre-extension value.
13956 if (SrcIndex >= NarrowByteWidth)
13957 return std::nullopt;
13958 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13959 }
13960
 // A whole-byte right shift moves the byte of interest up in the source.
13961 case ISD::SRA:
13962 case ISD::SRL: {
13963 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13964 if (!ShiftOp)
13965 return std::nullopt;
13966
13967 uint64_t BitShift = ShiftOp->getZExtValue();
13968
13969 if (BitShift % 8 != 0)
13970 return std::nullopt;
13971
13972 SrcIndex += BitShift / 8;
13973
13974 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13975 }
13976
 // Anything else is an opaque ultimate source.
13977 default: {
13978 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13979 }
13980 }
13981 llvm_unreachable("fully handled switch");
13982}
13983
13984// For a byte position in the result of an Or, traverse the tree and find the
13985// node (and the byte of the node) which ultimately provides this {Or,
13986// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13987// the byte position of the Op that corresponds with the originally requested
13988// byte of the Or \p Depth tracks how many recursive iterations we have
13989// performed. \p StartingIndex is the originally requested byte of the Or
13990static const std::optional<ByteProvider<SDValue>>
13991calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13992 unsigned StartingIndex = 0) {
 // NOTE(review): several lines of this function (some case labels and
 // getConstantZero() returns) were elided by the extraction; gaps in the
 // embedded numbering mark them — confirm against upstream before editing.
13993 // Finding Src tree of RHS of or typically requires at least 1 additional
13994 // depth
13995 if (Depth > 6)
13996 return std::nullopt;
13997
 // Only byte-sized scalars with Index inside the value are meaningful.
13998 unsigned BitWidth = Op.getScalarValueSizeInBits();
13999 if (BitWidth % 8 != 0)
14000 return std::nullopt;
14001 if (Index > BitWidth / 8 - 1)
14002 return std::nullopt;
14003
14004 bool IsVec = Op.getValueType().isVector();
14005 switch (Op.getOpcode()) {
14006 case ISD::OR: {
14007 if (IsVec)
14008 return std::nullopt;
14009
14010 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
14011 StartingIndex);
14012 if (!RHS)
14013 return std::nullopt;
14014 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14015 StartingIndex);
14016 if (!LHS)
14017 return std::nullopt;
14018 // A well formed Or will have two ByteProviders for each byte, one of which
14019 // is constant zero
14020 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14021 return std::nullopt;
14022 if (!LHS || LHS->isConstantZero())
14023 return RHS;
14024 if (!RHS || RHS->isConstantZero())
14025 return LHS;
14026 return std::nullopt;
14027 }
14028
14029 case ISD::AND: {
14030 if (IsVec)
14031 return std::nullopt;
14032
14033 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14034 if (!BitMaskOp)
14035 return std::nullopt;
14036
14037 uint32_t BitMask = BitMaskOp->getZExtValue();
14038 // Bits we expect for our StartingIndex
14039 uint32_t IndexMask = 0xFF << (Index * 8);
14040
14041 if ((IndexMask & BitMask) != IndexMask) {
14042 // If the result of the and partially provides the byte, then it
14043 // is not well formatted
14044 if (IndexMask & BitMask)
14045 return std::nullopt;
 // NOTE(review): a line was elided here (numbering gap at 14046, likely
 // a getConstantZero() return for a fully-masked byte).
14047 }
14048
14049 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
14050 }
14051
14052 case ISD::FSHR: {
14053 if (IsVec)
14054 return std::nullopt;
14055
14056 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14057 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14058 if (!ShiftOp || Op.getValueType().isVector())
14059 return std::nullopt;
14060
14061 uint64_t BitsProvided = Op.getValueSizeInBits();
14062 if (BitsProvided % 8 != 0)
14063 return std::nullopt;
14064
14065 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14066 if (BitShift % 8)
14067 return std::nullopt;
14068
 // Treat fshr as indexing into the byte concatenation X:Y.
14069 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14070 uint64_t ByteShift = BitShift / 8;
14071
14072 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14073 uint64_t BytesProvided = BitsProvided / 8;
14074 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14075 NewIndex %= BytesProvided;
14076 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
14077 }
14078
14079 case ISD::SRA:
14080 case ISD::SRL: {
14081 if (IsVec)
14082 return std::nullopt;
14083
14084 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14085 if (!ShiftOp)
14086 return std::nullopt;
14087
14088 uint64_t BitShift = ShiftOp->getZExtValue();
14089 if (BitShift % 8)
14090 return std::nullopt;
14091
14092 auto BitsProvided = Op.getScalarValueSizeInBits();
14093 if (BitsProvided % 8 != 0)
14094 return std::nullopt;
14095
14096 uint64_t BytesProvided = BitsProvided / 8;
14097 uint64_t ByteShift = BitShift / 8;
14098 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14099 // If the byte we are trying to provide (as tracked by index) falls in this
14100 // range, then the SRL provides the byte. The byte of interest of the src of
14101 // the SRL is Index + ByteShift
14102 return BytesProvided - ByteShift > Index
14103 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
14104 Index + ByteShift)
 // NOTE(review): the else-arm line was elided here (gap at 14105,
 // likely ": ByteProvider<SDValue>::getConstantZero();").
14106 }
14107
14108 case ISD::SHL: {
14109 if (IsVec)
14110 return std::nullopt;
14111
14112 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14113 if (!ShiftOp)
14114 return std::nullopt;
14115
14116 uint64_t BitShift = ShiftOp->getZExtValue();
14117 if (BitShift % 8 != 0)
14118 return std::nullopt;
14119 uint64_t ByteShift = BitShift / 8;
14120
14121 // If we are shifting by an amount greater than (or equal to)
14122 // the index we are trying to provide, then it provides 0s. If not,
14123 // then this bytes are not definitively 0s, and the corresponding byte
14124 // of interest is Index - ByteShift of the src
14125 return Index < ByteShift
 // NOTE(review): the then-arm line was elided here (gap at 14126,
 // likely "? ByteProvider<SDValue>::getConstantZero()").
14127 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14128 Depth + 1, StartingIndex);
14129 }
14130 case ISD::ANY_EXTEND:
14131 case ISD::SIGN_EXTEND:
14132 case ISD::ZERO_EXTEND:
 // NOTE(review): a case label was elided here (gap at 14133, likely
 // ISD::SIGN_EXTEND_INREG) — implied by the opcode test below.
14134 case ISD::AssertZext:
14135 case ISD::AssertSext: {
14136 if (IsVec)
14137 return std::nullopt;
14138
14139 SDValue NarrowOp = Op->getOperand(0);
14140 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14141 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14142 Op->getOpcode() == ISD::AssertZext ||
14143 Op->getOpcode() == ISD::AssertSext) {
14144 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14145 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14146 }
14147 if (NarrowBitWidth % 8 != 0)
14148 return std::nullopt;
14149 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14150
 // Bytes beyond the narrow value are zero only for zero-extension.
14151 if (Index >= NarrowByteWidth)
14152 return Op.getOpcode() == ISD::ZERO_EXTEND
14153 ? std::optional<ByteProvider<SDValue>>(
14155 : std::nullopt;
14156 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14157 }
14158
14159 case ISD::TRUNCATE: {
14160 if (IsVec)
14161 return std::nullopt;
14162
14163 uint64_t NarrowByteWidth = BitWidth / 8;
14164
14165 if (NarrowByteWidth >= Index) {
14166 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14167 StartingIndex);
14168 }
14169
14170 return std::nullopt;
14171 }
14172
14173 case ISD::CopyFromReg: {
14174 if (BitWidth / 8 > Index)
14175 return calculateSrcByte(Op, StartingIndex, Index);
14176
14177 return std::nullopt;
14178 }
14179
14180 case ISD::LOAD: {
14181 auto *L = cast<LoadSDNode>(Op.getNode());
14182
14183 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14184 if (NarrowBitWidth % 8 != 0)
14185 return std::nullopt;
14186 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14187
14188 // If the width of the load does not reach byte we are trying to provide for
14189 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
14190 // question
14191 if (Index >= NarrowByteWidth) {
14192 return L->getExtensionType() == ISD::ZEXTLOAD
14193 ? std::optional<ByteProvider<SDValue>>(
14195 : std::nullopt;
14196 }
14197
14198 if (NarrowByteWidth > Index) {
14199 return calculateSrcByte(Op, StartingIndex, Index);
14200 }
14201
14202 return std::nullopt;
14203 }
14204
 // BSWAP mirrors the byte order, so look up the reflected index.
14205 case ISD::BSWAP: {
14206 if (IsVec)
14207 return std::nullopt;
14208
14209 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
14210 Depth + 1, StartingIndex);
14211 }
14212
 // NOTE(review): a case label was elided here (gap at 14213, likely
 // ISD::EXTRACT_VECTOR_ELT) — implied by the vector-index math below.
14214 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14215 if (!IdxOp)
14216 return std::nullopt;
14217 auto VecIdx = IdxOp->getZExtValue();
14218 auto ScalarSize = Op.getScalarValueSizeInBits();
14219 if (ScalarSize < 32)
14220 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14221 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
14222 StartingIndex, Index);
14223 }
14224
14225 case AMDGPUISD::PERM: {
14226 if (IsVec)
14227 return std::nullopt;
14228
14229 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14230 if (!PermMask)
14231 return std::nullopt;
14232
 // Decode the selector byte for this index; 0-7 select source bytes,
 // 0x0c selects zero, anything else is unsupported here.
14233 auto IdxMask =
14234 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14235 if (IdxMask > 0x07 && IdxMask != 0x0c)
14236 return std::nullopt;
14237
14238 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14239 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14240
14241 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
 // NOTE(review): lines elided here (gap 14242-14243, likely the
 // getConstantZero() arm of this conditional).
14244 }
14245
14246 default: {
14247 return std::nullopt;
14248 }
14249 }
14250
14251 llvm_unreachable("fully handled switch");
14252}
14253
14254// Returns true if the Operand is a scalar and is 16 bits
14255static bool isExtendedFrom16Bits(SDValue &Operand) {
14256
14257 switch (Operand.getOpcode()) {
14258 case ISD::ANY_EXTEND:
14259 case ISD::SIGN_EXTEND:
14260 case ISD::ZERO_EXTEND: {
14261 auto OpVT = Operand.getOperand(0).getValueType();
14262 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14263 }
14264 case ISD::LOAD: {
14265 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14266 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
14267 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14268 ExtType == ISD::EXTLOAD) {
14269 auto MemVT = L->getMemoryVT();
14270 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14271 }
14272 return L->getMemoryVT().getSizeInBits() == 16;
14273 }
14274 default:
14275 return false;
14276 }
14277}
14278
14279// Returns true if the mask matches consecutive bytes, and the first byte
14280// begins at a power of 2 byte offset from 0th byte
// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a power of 2 byte offset from 0th byte
static bool addresses16Bits(int Mask) {
  const int LoSel = Mask & 0xff;
  const int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);
  // The two selected bytes must be adjacent, in order of increasing
  // addresses.
  if (HiSel - LoSel != 1)
    return false;
  // The low byte must also sit on a 16-bit boundary. E.g. taking 2
  // consecutive bytes starting at byte 1 still needs code to extract the
  // halfword, so a byte-wise v_perm is no worse there.
  return LoSel % 2 == 0;
}
14296
14297// Do not lower into v_perm if the operands are actually 16 bit
14298// and the selected bits (based on PermMask) correspond with two
14299// easily addressable 16 bit operands.
14301 SDValue &OtherOp) {
 // NOTE(review): the first signature line (14300, presumably
 // static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,) was
 // elided from this view.
 // Returns true when the perm should be emitted byte-wise: either operand is
 // not really a 16-bit value, or the mask does not address both halves as
 // clean halfwords.
14302 int Low16 = PermMask & 0xffff;
14303 int Hi16 = (PermMask & 0xffff0000) >> 16;
14304
14305 auto TempOp = peekThroughBitcasts(Op);
14306 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14307
 // NOTE(review): this tests TempOtherOp's size while classifying Op —
 // TempOp.getValueSizeInBits() looks intended; confirm against upstream
 // before changing.
14308 auto OpIs16Bit =
14309 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14310 if (!OpIs16Bit)
14311 return true;
14312
14313 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14314 isExtendedFrom16Bits(TempOtherOp);
14315 if (!OtherOpIs16Bit)
14316 return true;
14317
14318 // Do we cleanly address both
14319 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14320}
14321
14323 unsigned DWordOffset) {
 // NOTE(review): the signature line (14322, presumably
 // static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,)
 // was elided from this view.
 // Extracts the DWordOffset'th 32-bit word of Src as an i32, handling scalar
 // and vector sources of any element size.
14324 SDValue Ret;
14325
14326 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14327 // ByteProvider must be at least 8 bits
14328 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14329
 // Values of 32 bits or less are just normalized to i32.
14330 if (TypeSize <= 32)
14331 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14332
14333 if (Src.getValueType().isVector()) {
14334 auto ScalarTySize = Src.getScalarValueSizeInBits();
14335 auto ScalarTy = Src.getValueType().getScalarType();
 // 32-bit elements: the word is exactly one vector element.
14336 if (ScalarTySize == 32) {
14337 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14338 DAG.getConstant(DWordOffset, SL, MVT::i32));
14339 }
 // Wider elements: extract the containing element and shift the desired
 // word down to bit 0.
14340 if (ScalarTySize > 32) {
14341 Ret = DAG.getNode(
14342 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14343 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14344 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14345 if (ShiftVal)
14346 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14347 DAG.getConstant(ShiftVal, SL, MVT::i32));
14348 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14349 }
14350
 // Narrow elements: rebuild the word from however many elements cover it.
14351 assert(ScalarTySize < 32);
14352 auto NumElements = TypeSize / ScalarTySize;
14353 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14354 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14355 auto NumElementsIn32 = 32 / ScalarTySize;
14356 auto NumAvailElements = DWordOffset < Trunc32Elements
14357 ? NumElementsIn32
14358 : NumElements - NormalizedTrunc;
14359
 // NOTE(review): a declaration line was elided here (numbering gap at
 // 14360, presumably "SmallVector<SDValue, 2> VecSrcs;").
14361 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14362 NumAvailElements);
14363
14364 Ret = DAG.getBuildVector(
14365 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14366 VecSrcs);
14367 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14368 }
14369
14370 /// Scalar Type
14371 auto ShiftVal = 32 * DWordOffset;
14372 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14373 DAG.getConstant(ShiftVal, SL, MVT::i32));
14374 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14375}
14376
14378 SelectionDAG &DAG = DCI.DAG;
 // NOTE(review): the signature line (14377, presumably
 // static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI))
 // was elided from this view.
 // Tries to rewrite an i32 byte-combining tree (OR/FSHR of shifts, masks and
 // loads) into a single AMDGPUISD::PERM of at most two 32-bit sources.
14379 [[maybe_unused]] EVT VT = N->getValueType(0);
 // NOTE(review): a declaration line was elided here (numbering gap at 14380,
 // presumably "SmallVector<ByteProvider<SDValue>, 8> PermNodes;").
14381
14382 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14383 assert(VT == MVT::i32);
14384 for (int i = 0; i < 4; i++) {
14385 // Find the ByteProvider that provides the ith byte of the result of OR
14386 std::optional<ByteProvider<SDValue>> P =
14387 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14388 // TODO support constantZero
14389 if (!P || P->isConstantZero())
14390 return SDValue();
14391
14392 PermNodes.push_back(*P);
14393 }
14394 if (PermNodes.size() != 4)
14395 return SDValue();
14396
 // Track sources as {index of a representative PermNode, dword offset}.
14397 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14398 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14399 uint64_t PermMask = 0x00000000;
14400 for (size_t i = 0; i < PermNodes.size(); i++) {
14401 auto PermOp = PermNodes[i];
14402 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14403 // by sizeof(Src2) = 4
14404 int SrcByteAdjust = 4;
14405
14406 // If the Src uses a byte from a different DWORD, then it corresponds
14407 // with a difference source
14408 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14409 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
 // More than two distinct sources cannot be expressed by one v_perm.
14410 if (SecondSrc)
14411 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14412 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14413 return SDValue();
14414
14415 // Set the index of the second distinct Src node
14416 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14417 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14418 SrcByteAdjust = 0;
14419 }
14420 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
 // NOTE(review): a line was elided here (numbering gap at 14421).
14422 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14423 }
14424 SDLoc DL(N);
14425 SDValue Op = *PermNodes[FirstSrc.first].Src;
14426 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14427 assert(Op.getValueSizeInBits() == 32);
14428
14429 // Check that we are not just extracting the bytes in order from an op
14430 if (!SecondSrc) {
14431 int Low16 = PermMask & 0xffff;
14432 int Hi16 = (PermMask & 0xffff0000) >> 16;
14433
14434 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14435 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14436
14437 // The perm op would really just produce Op. So combine into Op
14438 if (WellFormedLow && WellFormedHi)
14439 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14440 }
14441
14442 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14443
14444 if (SecondSrc) {
14445 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14446 assert(OtherOp.getValueSizeInBits() == 32);
14447 }
14448
14449 // Check that we haven't just recreated the same FSHR node.
14450 if (N->getOpcode() == ISD::FSHR &&
14451 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14452 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14453 return SDValue();
14454
14455 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14456
14457 assert(Op.getValueType().isByteSized() &&
14458 OtherOp.getValueType().isByteSized());
14459
14460 // If the ultimate src is less than 32 bits, then we will only be
14461 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14462 // CalculateByteProvider would not have returned Op as source if we
14463 // used a byte that is outside its ValueType. Thus, we are free to
14464 // ANY_EXTEND as the extended bits are dont-cares.
14465 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14466 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14467
14468 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14469 DAG.getConstant(PermMask, DL, MVT::i32));
14470 }
14471 return SDValue();
14472}
14473
// Combine (or x, y) into more profitable forms:
//  * i1:   merge two fp_class tests of the same value into one mask.
//  * i32:  fold constants into an existing v_perm, or form a new v_perm for
//          divergent byte-assembly patterns.
//  * v2i32: recognize an identity OR of complementary extract/build_vector
//          halves and return the original source vector.
//  * i64:  split when one side is a zero-extended i32 or a constant.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(0);
      // Both classes must test the same value.
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask =
          (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  // NOTE(review): the opening condition of this if statement appears
  // truncated in this copy of the file — confirm against upstream.
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    // Merge the OR'd constant into the existing perm selector.
    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    // If all the uses of an or need to extract the individual elements, do not
    // attempt to lower into v_perm
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
        return true;

      // If we have any non-vectorized use, then it is a candidate for v_perm
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO -- whitelist more uses
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUser->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(N->users(), usesCombinedOperand))
      return SDValue();

    // ~0u means "not representable as a single byte-permute of one source".
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check of we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      // At least one side is not a simple permute; try the general matcher.
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  // Detect identity v2i32 OR and replace with identity source node.
  // Specifically an Or that has operands constructed from the same source node
  // via extract_vector_elt and build_vector. I.E.
  // v2i32 or(
  //   v2i32 build_vector(
  //     i32 extract_elt(%IdentitySrc, 0),
  //     i32 0
  //   ),
  //   v2i32 build_vector(
  //     i32 0,
  //     i32 extract_elt(%IdentitySrc, 1)
  //   ) )
  // =>
  // v2i32 %IdentitySrc

  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
      RHS->getOpcode() == ISD::BUILD_VECTOR) {

    // The non-extracted element of each build_vector must be zero.
    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));

    // Test for and normalise build vectors.
    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {

      // Get the extract_vector_element operands.
      SDValue LEVE = LHS->getOperand(0);
      SDValue REVE = RHS->getOperand(1);

      // NOTE(review): the second half of this condition appears truncated in
      // this copy of the file (presumably an EXTRACT_VECTOR_ELT check on
      // REVE) — confirm against upstream.
      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        // Check that different elements from the same vector are
        // extracted.
        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
            LEVE->getOperand(1) != REVE->getOperand(1)) {
          SDValue IdentitySrc = LEVE.getOperand(0);
          return IdentitySrc;
        }
      }
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      // Revisit the new halves so further combines can fire on them.
      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  // i64 or-with-constant: split into two i32 ops when profitable.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                                 N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}
14672
14673SDValue SITargetLowering::performXorCombine(SDNode *N,
14674 DAGCombinerInfo &DCI) const {
14675 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14676 return RV;
14677
14678 SDValue LHS = N->getOperand(0);
14679 SDValue RHS = N->getOperand(1);
14680
14681 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14682 SelectionDAG &DAG = DCI.DAG;
14683
14684 EVT VT = N->getValueType(0);
14685 if (CRHS && VT == MVT::i64) {
14686 if (SDValue Split =
14687 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14688 return Split;
14689 }
14690
14691 // v2i32 (xor (vselect cc, x, y), K) ->
14692 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14693 // replaced with source modifiers when the select is lowered to CNDMASK.
14694 unsigned Opc = LHS.getOpcode();
14695 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14696 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14697 CRHS && CRHS->getAPIntValue().isSignMask()) {
14698 SDValue CC = LHS->getOperand(0);
14699 SDValue TRUE = LHS->getOperand(1);
14700 SDValue FALSE = LHS->getOperand(2);
14701 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14702 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14703 SDValue XSelect =
14704 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14705 return XSelect;
14706 }
14707
14708 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14709 // fneg-like xors into 64-bit select.
14710 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14711 // This looks like an fneg, try to fold as a source modifier.
14712 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14714 // xor (select c, a, b), 0x80000000 ->
14715 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14716 SDLoc DL(N);
14717 SDValue CastLHS =
14718 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14719 SDValue CastRHS =
14720 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14721 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14722 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14723 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14724 LHS->getOperand(0), FNegLHS, FNegRHS);
14725 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14726 }
14727 }
14728
14729 return SDValue();
14730}
14731
14732SDValue
14733SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14734 DAGCombinerInfo &DCI) const {
14735 if (!Subtarget->has16BitInsts() ||
14736 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
14737 return SDValue();
14738
14739 EVT VT = N->getValueType(0);
14740 if (VT != MVT::i32)
14741 return SDValue();
14742
14743 SDValue Src = N->getOperand(0);
14744 if (Src.getValueType() != MVT::i16)
14745 return SDValue();
14746
14747 if (!Src->hasOneUse())
14748 return SDValue();
14749
14750 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
14751 // possible we're missing out on some combine opportunities, but we'd need to
14752 // weigh the cost of extracting the byte from the upper dwords.
14753
14754 std::optional<ByteProvider<SDValue>> BP0 =
14755 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
14756 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14757 return SDValue();
14758 SDValue V0 = *BP0->Src;
14759
14760 std::optional<ByteProvider<SDValue>> BP1 =
14761 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
14762 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14763 return SDValue();
14764
14765 SDValue V1 = *BP1->Src;
14766
14767 if (V0 == V1)
14768 return SDValue();
14769
14770 SelectionDAG &DAG = DCI.DAG;
14771 SDLoc DL(N);
14772 uint32_t PermMask = 0x0c0c0c0c;
14773 if (V0) {
14774 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
14775 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14776 }
14777
14778 if (V1) {
14779 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
14780 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14781 }
14782
14783 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
14784 DAG.getConstant(PermMask, DL, MVT::i32));
14785}
14786
// Fold sign_extend_inreg of the zero-extending scalar/vector buffer loads
// into the natively sign-extending load variants.
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  // Operand 1 of SIGN_EXTEND_INREG carries the width being extended from.
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    // The replacement load produces an i32; truncate back to the original
    // value type below.
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue Ops[] = {
        Src.getOperand(0), // source register
        Src.getOperand(1), // offset
        Src.getOperand(2)  // cachePolicy
    };
    // Reuse the original load's memory VT and memory operand so alias info
    // and atomicity are preserved.
    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    return LoadVal;
  }
  // Same idea for the VMEM buffer loads. These carry a chain, so only fold
  // when the unsigned load has no other users.
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {Src.getOperand(0), // Chain
                     Src.getOperand(1), // rsrc
                     Src.getOperand(2), // vindex
                     Src.getOperand(3), // voffset
                     Src.getOperand(4), // soffset
                     Src.getOperand(5), // offset
                     Src.getOperand(6), Src.getOperand(7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    // Return both the value and the chain of the new load.
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
14845
14846SDValue SITargetLowering::performClassCombine(SDNode *N,
14847 DAGCombinerInfo &DCI) const {
14848 SelectionDAG &DAG = DCI.DAG;
14849 SDValue Mask = N->getOperand(1);
14850
14851 // fp_class x, 0 -> false
14852 if (isNullConstant(Mask))
14853 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14854
14855 if (N->getOperand(0).isUndef())
14856 return DAG.getUNDEF(MVT::i1);
14857
14858 return SDValue();
14859}
14860
// Combine AMDGPUISD::RCP nodes.
SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  // rcp(undef) -> qnan.
  if (N0.isUndef()) {
    return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
                                 SDLoc(N), VT);
  }

  // f32 rcp of an int-to-fp conversion: use the RCP_IFLAG variant
  // (presumably safe because the conversion result is never a denormal or
  // NaN — confirm against the node's definition).
  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
                         N0.getOpcode() == ISD::SINT_TO_FP)) {
    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
                           N->getFlags());
  }

  // rcp(sqrt(x)) -> rsq(x) for f16 when both nodes allow contraction.
  // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
  if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
      N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
                           N->getFlags());
  }

  // NOTE(review): the fallthrough return appears truncated in this copy of
  // the file (presumably deferring to the AMDGPUTargetLowering
  // implementation) — confirm against upstream.
}
14886
// Conservatively decide whether Op already holds a canonical FP value (no
// unquieted sNaN; denormals consistent with the function's denormal mode), so
// an explicit fcanonicalize of it would be a no-op.
// NOTE(review): the first line of this signature appears truncated in this
// copy of the file.
                                      SDNodeFlags UserFlags,
                                      unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  // fcanonicalize's own result is canonical by definition.
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    // A signaling NaN constant is never canonical.
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    // Denormal constants are canonical only under IEEE denormal handling
    // (i.e. when they would not be flushed).
    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
  case ISD::FLDEXP:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::AND:
    if (Op.getValueType() == MVT::i32) {
      // Be careful as we only know it is a bitcast floating point type. It
      // could be f32, v2f16, we have no way of knowing. Luckily the constant
      // value that we optimize for, which comes up in fp32 to bf16 conversions,
      // is valid to optimize for all types.
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
        }
      }
    }
    break;

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations different based these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum..

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(DAG, Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    // Both select arms must be canonical.
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    // Canonical iff every element is canonical.
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  // NOTE(review): the case labels for the next two returns appear truncated
  // in this copy of the file — confirm against upstream.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST:
    // TODO: This is incorrect as it loses track of the operand's type. We may
    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
    // same bits that are canonicalized in one type need not be in the other.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  case ISD::TRUNCATE: {
    // Hack round the mess we make when legalizing extract_vector_elt
    if (Op.getValueType() == MVT::i16) {
      SDValue TruncSrc = Op.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }
    return false;
  }
  // NOTE(review): the case label opening this scope (presumably
  // ISD::INTRINSIC_WO_CHAIN) appears truncated in this copy of the file.
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  // Fallback: a value is acceptable if denormals are IEEE-handled for its
  // type and it is known not to be a signaling NaN.
  // FIXME: denormalsEnabledForType is broken for dynamic
  return denormalsEnabledForType(DAG, Op.getValueType()) &&
         (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
}
15090
// GlobalISel counterpart of isCanonicalized above: decide whether the value
// defined for Reg is already canonical.
// NOTE(review): the first line of this signature appears truncated in this
// copy of the file.
                                      unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  // G_FCANONICALIZE's result is canonical by definition.
  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  // NOTE(review): the matcher guard populating FCR (presumably an mi_match on
  // a constant/splat pattern) appears truncated in this copy of the file.
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    // Denormal constants are canonical only under IEEE denormal handling.
    DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These operations produce canonical results (flushing if required).
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // Bit operations: canonical iff the input is canonical.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(MRI.getType(Reg), MF))
      return true;

    // Otherwise every input must itself be canonical (checked by the
    // G_BUILD_VECTOR operand loop below).
    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
      if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    return false;
  }

  llvm_unreachable("invalid operation");
}
15209
15210// Constant fold canonicalize.
15211SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15212 const SDLoc &SL, EVT VT,
15213 const APFloat &C) const {
15214 // Flush denormals to 0 if not enabled.
15215 if (C.isDenormal()) {
15216 DenormalMode Mode =
15217 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15218 if (Mode == DenormalMode::getPreserveSign()) {
15219 return DAG.getConstantFP(
15220 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15221 }
15222
15223 if (Mode != DenormalMode::getIEEE())
15224 return SDValue();
15225 }
15226
15227 if (C.isNaN()) {
15228 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15229 if (C.isSignaling()) {
15230 // Quiet a signaling NaN.
15231 // FIXME: Is this supposed to preserve payload bits?
15232 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15233 }
15234
15235 // Make sure it is the canonical NaN bitpattern.
15236 //
15237 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15238 // immediate?
15239 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15240 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15241 }
15242
15243 // Already canonical.
15244 return DAG.getConstantFP(C, SL, VT);
15245}
15246
  // An element needs no runtime canonicalization work when it is undef or an
  // FP constant. NOTE(review): the signature line of this helper appears
  // truncated in this copy of the file.
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
15250
// Combine fcanonicalize nodes: fold undef/constant inputs and distribute
// over v2f16 build_vectors.
SDValue
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    // NOTE(review): the construction of QNaN appears truncated in this copy
    // of the file (presumably APFloat::getQNaN of VT's semantics).
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  // Constant (or constant splat) input: fold via getCanonicalConstantFP.
  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  // (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0

  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    // NOTE(review): the guard opening this scope appears truncated in this
    // copy of the file (presumably a vectorEltWillFoldAway check on Lo/Hi).
    for (unsigned I = 0; I != 2; ++I) {
      SDValue Op = N0.getOperand(I);
      if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
        // Constant element: canonicalize at compile time.
        NewElts[I] =
            getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
      } else if (Op.isUndef()) {
        // Handled below based on what the other operand is.
        NewElts[I] = Op;
      } else {
        NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
      }
    }

    // If one half is undef, and one is constant, prefer a splat vector rather
    // than the normal qNaN. If it's a register, prefer 0.0 since that's
    // cheaper to use and may be free with a packed operation.
    // NOTE(review): the extra isa<ConstantFPSDNode> guard below makes the
    // ternary's 0.0 arm dead and is asymmetric with the NewElts[1] handling
    // — confirm against upstream whether this is intended.
    if (NewElts[0].isUndef()) {
      if (isa<ConstantFPSDNode>(NewElts[1]))
        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
                         ? NewElts[1]
                         : DAG.getConstantFP(0.0f, SL, EltVT);
    }

    if (NewElts[1].isUndef()) {
      NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
                       ? NewElts[0]
                       : DAG.getConstantFP(0.0f, SL, EltVT);
    }

    return DAG.getBuildVector(VT, SL, NewElts);
  }
  }

  return SDValue();
}
15320
15321static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15322 switch (Opc) {
15323 case ISD::FMAXNUM:
15324 case ISD::FMAXNUM_IEEE:
15325 case ISD::FMAXIMUMNUM:
15326 return AMDGPUISD::FMAX3;
15327 case ISD::FMAXIMUM:
15328 return AMDGPUISD::FMAXIMUM3;
15329 case ISD::SMAX:
15330 return AMDGPUISD::SMAX3;
15331 case ISD::UMAX:
15332 return AMDGPUISD::UMAX3;
15333 case ISD::FMINNUM:
15334 case ISD::FMINNUM_IEEE:
15335 case ISD::FMINIMUMNUM:
15336 return AMDGPUISD::FMIN3;
15337 case ISD::FMINIMUM:
15338 return AMDGPUISD::FMINIMUM3;
15339 case ISD::SMIN:
15340 return AMDGPUISD::SMIN3;
15341 case ISD::UMIN:
15342 return AMDGPUISD::UMIN3;
15343 default:
15344 llvm_unreachable("Not a min/max opcode");
15345 }
15346}
15347
15348SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15349 const SDLoc &SL, SDValue Src,
15350 SDValue MinVal,
15351 SDValue MaxVal,
15352 bool Signed) const {
15353
15354 // med3 comes from
15355 // min(max(x, K0), K1), K0 < K1
15356 // max(min(x, K0), K1), K1 < K0
15357 //
15358 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15359 // min/max op.
15360 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15361 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15362
15363 if (!MinK || !MaxK)
15364 return SDValue();
15365
15366 if (Signed) {
15367 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15368 return SDValue();
15369 } else {
15370 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15371 return SDValue();
15372 }
15373
15374 EVT VT = MinK->getValueType(0);
15375 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15376 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15377 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15378
15379 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15380 // not available, but this is unlikely to be profitable as constants
15381 // will often need to be materialized & extended, especially on
15382 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15383 return SDValue();
15384}
15385
15388 return C;
15389
15391 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15392 return C;
15393 }
15394
15395 return nullptr;
15396}
15397
/// Try to fold min(max(x, K0), K1) into fmed3(x, K0, K1) or a clamp.
/// \p Op0 is the inner max node (its operand 1 holds K0) and \p Op1 is the
/// outer bound K1; both bounds must be (splat) FP constants with K0 <= K1.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  // K0 is the RHS of the inner max.
  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  // med3 with a nan input acts like
  // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
  //
  // So the result depends on whether the IEEE mode bit is enabled or not with a
  // signaling nan input.
  // ieee=1
  // s0 snan: yields s2
  // s1 snan: yields s2
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)

  // ieee=0
  // s0 snan: min(s1, s2)
  // s1 snan: min(s0, s2)
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
  // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
  // can only form if op0 is fmaxnum_ieee if IEEE=1.
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    // Only form fmed3 when each constant either already has other users or
    // can be encoded as an inline constant (no new literal materialization).
    if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
                         SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
15471
15472/// \return true if the subtarget supports minimum3 and maximum3 with the given
15473/// base min/max opcode \p Opc for type \p VT.
15474static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15475 EVT VT) {
15476 switch (Opc) {
15477 case ISD::FMINNUM:
15478 case ISD::FMAXNUM:
15479 case ISD::FMINNUM_IEEE:
15480 case ISD::FMAXNUM_IEEE:
15481 case ISD::FMINIMUMNUM:
15482 case ISD::FMAXIMUMNUM:
15483 case AMDGPUISD::FMIN_LEGACY:
15484 case AMDGPUISD::FMAX_LEGACY:
15485 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15486 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15487 case ISD::FMINIMUM:
15488 case ISD::FMAXIMUM:
15489 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15490 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15491 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15492 case ISD::SMAX:
15493 case ISD::SMIN:
15494 case ISD::UMAX:
15495 case ISD::UMIN:
15496 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15497 default:
15498 return false;
15499 }
15500
15501 llvm_unreachable("not a min/max opcode");
15502}
15503
/// Combine nested min/max into min3/max3/med3 forms, and relax
/// fminimum/fmaximum to the cheaper *num forms when NaNs are known absent.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  if (supportsMin3Max3(*Subtarget, Opc, VT)) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0.getOperand(0), Op0.getOperand(1), Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0, Op1.getOperand(0), Op1.getOperand(1));
    }
  }

  // Integer med3: the clamp bound fed to the inner op becomes the middle
  // operand.
  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
      return Med3;
  }

  // if !is_snan(x):
  // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
  // for some types, but at a higher cost since it's implemented with a 3
  // operand form.
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
      !Subtarget->hasIEEEMinimumMaximumInsts() &&
    unsigned NewOpc =
      return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
  }

  return SDValue();
}
15593
15597 // FIXME: Should this be allowing -0.0?
15598 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15599 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15600 }
15601 }
15602
15603 return false;
15604}
15605
15606// FIXME: Should only worry about snans for version with chain.
15607SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15608 DAGCombinerInfo &DCI) const {
15609 EVT VT = N->getValueType(0);
15610 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15611 // NaNs. With a NaN input, the order of the operands may change the result.
15612
15613 SelectionDAG &DAG = DCI.DAG;
15614 SDLoc SL(N);
15615
15616 SDValue Src0 = N->getOperand(0);
15617 SDValue Src1 = N->getOperand(1);
15618 SDValue Src2 = N->getOperand(2);
15619
15620 if (isClampZeroToOne(Src0, Src1)) {
15621 // const_a, const_b, x -> clamp is safe in all cases including signaling
15622 // nans.
15623 // FIXME: Should this be allowing -0.0?
15624 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15625 }
15626
15627 const MachineFunction &MF = DAG.getMachineFunction();
15628 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15629
15630 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15631 // handling no dx10-clamp?
15632 if (Info->getMode().DX10Clamp) {
15633 // If NaNs is clamped to 0, we are free to reorder the inputs.
15634
15635 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15636 std::swap(Src0, Src1);
15637
15638 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15639 std::swap(Src1, Src2);
15640
15641 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15642 std::swap(Src0, Src1);
15643
15644 if (isClampZeroToOne(Src1, Src2))
15645 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15646 }
15647
15648 return SDValue();
15649}
15650
15651SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15652 DAGCombinerInfo &DCI) const {
15653 SDValue Src0 = N->getOperand(0);
15654 SDValue Src1 = N->getOperand(1);
15655 if (Src0.isUndef() && Src1.isUndef())
15656 return DCI.DAG.getUNDEF(N->getValueType(0));
15657 return SDValue();
15658}
15659
15660// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15661// expanded into a set of cmp/select instructions.
15663 unsigned NumElem,
15664 bool IsDivergentIdx,
15665 const GCNSubtarget *Subtarget) {
15667 return false;
15668
15669 unsigned VecSize = EltSize * NumElem;
15670
15671 // Sub-dword vectors of size 2 dword or less have better implementation.
15672 if (VecSize <= 64 && EltSize < 32)
15673 return false;
15674
15675 // Always expand the rest of sub-dword instructions, otherwise it will be
15676 // lowered via memory.
15677 if (EltSize < 32)
15678 return true;
15679
15680 // Always do this if var-idx is divergent, otherwise it will become a loop.
15681 if (IsDivergentIdx)
15682 return true;
15683
15684 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15685 unsigned NumInsts = NumElem /* Number of compares */ +
15686 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15687
15688 // On some architectures (GFX9) movrel is not available and it's better
15689 // to expand.
15690 if (Subtarget->useVGPRIndexMode())
15691 return NumInsts <= 16;
15692
15693 // If movrel is available, use it instead of expanding for vector of 8
15694 // elements.
15695 if (Subtarget->hasMovrel())
15696 return NumInsts <= 15;
15697
15698 return true;
15699}
15700
15702 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15703 if (isa<ConstantSDNode>(Idx))
15704 return false;
15705
15706 SDValue Vec = N->getOperand(0);
15707 EVT VecVT = Vec.getValueType();
15708 EVT EltVT = VecVT.getVectorElementType();
15709 unsigned EltSize = EltVT.getSizeInBits();
15710 unsigned NumElem = VecVT.getVectorNumElements();
15711
15713 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15714}
15715
/// DAG combines for EXTRACT_VECTOR_ELT: push extracts through fneg/fabs and
/// vector binops, expand dynamic-index extracts into select chains, fold
/// extracts of bitcast 64-bit immediates, and widen sub-dword extracts of
/// loaded vectors to 32-bit extracts.
SDValue
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(0);

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned VecEltSize = VecEltVT.getSizeInBits();

  // extract_vector_elt (fneg/fabs v), i -> fneg/fabs (extract_vector_elt v, i)
  if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue Elt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
  }

  // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
  // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
  // There are optimisations to transform 64-bit shifts into 32-bit shifts
  // depending on the shift operand. See e.g. performSraCombine().
  // This combine ensures that the optimisation is compatible with v2i32
  // legalised AND.
  if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
      Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {

    // NOTE(review): when the mask constant is not 0x1f this returns from the
    // whole combine, skipping every fold below, instead of merely skipping
    // this transform — confirm that is intended.
    if (!C || C->getZExtValue() != 0x1f)
      return SDValue();

    SDLoc SL(N);
    SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
    SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
                              Vec->getOperand(0), N->getOperand(1));
    SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
    // Replaces N in place; execution falls through to the combines below.
    DAG.ReplaceAllUsesWith(N, A.getNode());
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  // =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
    // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue V;
    // Build a select cascade; lane 0 is the fallback value.
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getVectorIdxConstant(I, SL);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
  // =>
  // i32:Lo(k) if Idx == 0, or
  // i32:Hi(k) if Idx == 1
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
    SDLoc SL(N);
    SDValue PeekThrough = Vec.getOperand(0);
    auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
    if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
      uint64_t KImmValue = KImm->getZExtValue();
      return DAG.getConstant(
          (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
    }
    auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
    if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
      uint64_t KFPImmValue =
          KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
      return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
                                 0xffffffff,
                             SL, MVT::i32);
    }
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    // Locate the dword holding the element, and the element's bit offset
    // within that dword.
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
    DCI.AddToWorklist(Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
    }

    assert(ResVT.isScalarInteger());
    return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
  }

  return SDValue();
}
15878
/// Expand INSERT_VECTOR_ELT with a dynamic index into a BUILD_VECTOR of
/// per-lane selects, when profitable for this subtarget.
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  // For each lane keep the existing element unless the dynamic index selects
  // that lane, in which case take the inserted value.
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}
15907
15908/// Return the source of an fp_extend from f16 to f32, or a converted FP
15909/// constant.
15911 if (Src.getOpcode() == ISD::FP_EXTEND &&
15912 Src.getOperand(0).getValueType() == MVT::f16) {
15913 return Src.getOperand(0);
15914 }
15915
15916 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15917 APFloat Val = CFP->getValueAPF();
15918 bool LosesInfo = true;
15920 if (!LosesInfo)
15921 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15922 }
15923
15924 return SDValue();
15925}
15926
15927SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15928 DAGCombinerInfo &DCI) const {
15929 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15930 "combine only useful on gfx8");
15931
15932 SDValue TruncSrc = N->getOperand(0);
15933 EVT VT = N->getValueType(0);
15934 if (VT != MVT::f16)
15935 return SDValue();
15936
15937 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15938 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15939 return SDValue();
15940
15941 SelectionDAG &DAG = DCI.DAG;
15942 SDLoc SL(N);
15943
15944 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15945 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15946 // casting back.
15947
15948 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15949 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15950 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15951 if (!A)
15952 return SDValue();
15953
15954 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15955 if (!B)
15956 return SDValue();
15957
15958 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15959 if (!C)
15960 return SDValue();
15961
15962 // This changes signaling nan behavior. If an input is a signaling nan, it
15963 // would have been quieted by the fpext originally. We don't care because
15964 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15965 // we would be worse off than just doing the promotion.
15966 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15967 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15968 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15969 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15970}
15971
/// Choose the fused multiply-add opcode (FMAD or FMA) for contracting the
/// mul/add pair rooted at \p N0 / \p N1, or 0 if no fusion should be done.
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 &&
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
    return ISD::FMAD;

  // FMA is allowed when global fast fusion is on, or both nodes carry the
  // contract fast-math flag.
  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
    return ISD::FMA;
  }

  // 0 means "do not fuse".
  return 0;
}
15996
15997// For a reassociatable opcode perform:
15998// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15999SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16000 SelectionDAG &DAG) const {
16001 EVT VT = N->getValueType(0);
16002 if (VT != MVT::i32 && VT != MVT::i64)
16003 return SDValue();
16004
16005 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
16006 return SDValue();
16007
16008 unsigned Opc = N->getOpcode();
16009 SDValue Op0 = N->getOperand(0);
16010 SDValue Op1 = N->getOperand(1);
16011
16012 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16013 return SDValue();
16014
16015 if (Op0->isDivergent())
16016 std::swap(Op0, Op1);
16017
16018 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16019 return SDValue();
16020
16021 SDValue Op2 = Op1.getOperand(1);
16022 Op1 = Op1.getOperand(0);
16023 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16024 return SDValue();
16025
16026 if (Op1->isDivergent())
16027 std::swap(Op1, Op2);
16028
16029 SDLoc SL(N);
16030 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16031 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16032}
16033
/// Build an AMDGPU mad 64x32 node computing N0 * N1 + N2 and truncate the
/// 64-bit result back to \p VT. The node's second (i1 carry-out) result is
/// ignored.
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                           SDValue N0, SDValue N1, SDValue N2, bool Signed) {
  // NOTE(review): MadOpc is expected to be selected from `Signed`
  // (signed vs. unsigned mad opcode) — confirm against the full source.
  // The mad produces a 64-bit value plus an i1 carry-out.
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
16041
16042// Fold
16043// y = lshr i64 x, 32
16044// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16045// with Const.hi == -1
16046// To
16047// res = mad_u64_u32 y.lo ,Const.lo, x.lo
16049 SDValue MulLHS, SDValue MulRHS,
16050 SDValue AddRHS) {
16051 if (MulRHS.getOpcode() == ISD::SRL)
16052 std::swap(MulLHS, MulRHS);
16053
16054 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16055 return SDValue();
16056
16057 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16058 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16059 MulLHS.getOperand(0) != AddRHS)
16060 return SDValue();
16061
16063 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16064 return SDValue();
16065
16066 SDValue ConstMul =
16067 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
16068 return getMad64_32(DAG, SL, MVT::i64,
16069 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16070 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16071}
16072
16073// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16074// multiplies, if any.
16075//
16076// Full 64-bit multiplies that feed into an addition are lowered here instead
16077// of using the generic expansion. The generic expansion ends up with
16078// a tree of ADD nodes that prevents us from using the "add" part of the
16079// MAD instruction. The expansion produced here results in a chain of ADDs
16080// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->isAnyAdd());

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  // Only scalar types wider than 32 bits and at most 64 bits are handled.
  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // Canonicalize the MUL operand into LHS (the caller guarantees one operand
  // is a MUL).
  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(LHS, RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (!User->isAnyAdd())
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(0);
  SDValue MulRHS = LHS.getOperand(1);
  SDValue AddRHS = RHS;

  // Special case: (add (mul (srl x, 32), C), x) where C.hi == -1.
  if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
    return FoldedMAD;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo =
        numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
    MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
    AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  // accum = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    // Add lhs.hi * rhs.lo into the high half when lhs doesn't fit in 32
    // unsigned bits.
    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    // Likewise lhs.lo * rhs.hi.
    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    // Reassemble the 64-bit accumulator from its halves.
    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
    Accum = DAG.getBitcast(MVT::i64, Accum);
  }

  // Narrow back down if the original type was narrower than i64.
  if (VT != MVT::i64)
    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
  return Accum;
}
16200
16201SDValue
16202SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16203 DAGCombinerInfo &DCI) const {
16204 SDValue RHS = N->getOperand(1);
16205 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16206 if (!CRHS)
16207 return SDValue();
16208
16209 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16210 // common.
16211 uint64_t Val = CRHS->getZExtValue();
16212 if (countr_zero(Val) >= 32) {
16213 SelectionDAG &DAG = DCI.DAG;
16214 SDLoc SL(N);
16215 SDValue LHS = N->getOperand(0);
16216
16217 // Avoid carry machinery if we know the low half of the add does not
16218 // contribute to the final result.
16219 //
16220 // add i64:x, K if computeTrailingZeros(K) >= 32
16221 // => build_pair (add x.hi, K.hi), x.lo
16222
16223 // Breaking the 64-bit add here with this strange constant is unlikely
16224 // to interfere with addressing mode patterns.
16225
16226 SDValue Hi = getHiHalf64(LHS, DAG);
16227 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16228 unsigned Opcode = N->getOpcode();
16229 if (Opcode == ISD::PTRADD)
16230 Opcode = ISD::ADD;
16231 SDValue AddHi =
16232 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16233
16234 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16235 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16236 }
16237
16238 return SDValue();
16239}
16240
16241// Collect the ultimate src of each of the mul node's operands, and confirm
16242// each operand is 8 bytes.
16243static std::optional<ByteProvider<SDValue>>
16244handleMulOperand(const SDValue &MulOperand) {
16245 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16246 if (!Byte0 || Byte0->isConstantZero()) {
16247 return std::nullopt;
16248 }
16249 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16250 if (Byte1 && !Byte1->isConstantZero()) {
16251 return std::nullopt;
16252 }
16253 return Byte0;
16254}
16255
// Merge two v_perm byte-select masks. A byte value of 0x0c selects the
// constant-zero lane; a merged byte stays the zero selector only where both
// inputs select zero, otherwise the real (non-zero-select) byte wins.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned ZeroSelect = 0x0c0c0c0c;

  const unsigned FirstZeroSel = First & ZeroSelect;
  const unsigned SecondZeroSel = Second & ZeroSelect;

  // Every byte position must carry the zero selector in at least one of the
  // two masks; otherwise the masks pick conflicting source bytes.
  assert((FirstZeroSel | SecondZeroSel) & 0x000000FF);
  assert((FirstZeroSel | SecondZeroSel) & 0x0000FF00);
  assert((FirstZeroSel | SecondZeroSel) & 0x00FF0000);
  assert((FirstZeroSel | SecondZeroSel) & 0xFF000000);

  return (First & ~ZeroSelect) | (Second & ~ZeroSelect) |
         (FirstZeroSel & SecondZeroSel);
}
16269
16270struct DotSrc {
16272 int64_t PermMask;
16274};
16275
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {

  assert(Src0.Src.has_value() && Src1.Src.has_value());
  // Src0s and Src1s are empty, just place arbitrarily.
  if (Step == 0) {
    // First link of the chain: select byte (SrcOffset % 4) into the MSB lane
    // and zero (0x0c) the remaining three byte lanes.
    Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src0.SrcOffset / 4});
    Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src1.SrcOffset / 4});
    return;
  }

  // Try both pairings (Src0,Src1) and (Src1,Src0) against the accumulated
  // source lists.
  for (int BPI = 0; BPI < 2; BPI++) {
    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
    if (BPI == 1) {
      BPP = {Src1, Src0};
    }
    // 0x0c in a v_perm selector byte yields constant zero; FMask isolates the
    // byte lane written by this Step.
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find Src vector which contains our SDValue, if so, add our
    // perm mask to the existing one. If we are unable to find a match for the
    // first SDValue, attempt to find match for the second.
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };

      auto *Match = llvm::find_if(Srcs, MatchesFirst);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
        FirstGroup = I;
        break;
      }
    }
    if (FirstGroup != -1) {
      // The second value goes into the opposite group from the first.
      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      auto *Match = llvm::find_if(Srcs, MatchesSecond);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
      } else
        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
      return;
    }
  }

  // If we have made it here, then we could not find a match in Src0s or Src1s
  // for either Src0 or Src1, so just place them arbitrarily.

  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      {*Src0.Src,
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src0.SrcOffset / 4});
  Src1s.push_back(
      {*Src1.Src,
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src1.SrcOffset / 4});
}
16351
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
  }

  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(FirstElt);

  // NOTE(review): the declaration of the Perms vector appears to be elided
  // in this view; the loop below pushes partial results into it.

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    // Move the first element's selected bytes down into the low lanes of the
    // combined perm while keeping its constant-zero (0x0c) lanes intact.
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
    // original 0x0C.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(FirstMask, SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);

    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
                                SecondVal,
                                DAG.getConstant(PermMask, SL, MVT::i32)));

    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

      Perms.push_back(
          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
      break;
    }
  }

  // At most two partial perms can be produced; OR them into the final dword.
  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
             : Perms[0];
}
16419
16420static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16421 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16422 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16423 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16424 EntryMask += ZeroMask;
16425 }
16426}
16427
16428static bool isMul(const SDValue Op) {
16429 auto Opcode = Op.getOpcode();
16430
16431 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16432 Opcode == AMDGPUISD::MUL_I24);
16433}
16434
static std::optional<bool>
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 is irrelevant.
  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
    return false;

  // Classify each operand by its known MSB: leading zeros => unsigned,
  // leading ones => negative signed value.
  auto Known0 = DAG.computeKnownBits(S0Op, 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(S1Op, 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // There are 9 possible permutations of
  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}

  // In two permutations, the sign bits are known to be the same for both Ops,
  // so simply return Signed / Unsigned corresponding to the MSB

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // In another two permutations, the sign bits are known to be opposite. In
  // this case return std::nullopt to indicate a bad match.

  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown is if it was sign extended from unknown value, or if it
  // was any extended. In either case, it is correct to use the signed
  // version of the signedness semantics of dot4

  // In two of such permutations, we know the sign bit is set for
  // one op, and the other is unknown. It is okay to use the signed version of
  // dot4.
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
    return true;

  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
    return true;

  // In two of such permutations, we know the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;

  llvm_unreachable("Fully covered condition");
}
16497
// Combine ADD nodes: fold add-of-mul into mad64_32, reassociate for scalar
// ops, shrink i64 adds whose constant has 32 zero low bits, match chains of
// byte-wise multiply-adds onto v_dot4, and fold setcc extensions / carries
// into uaddo_carry / usubo_carry.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // add (mul x, y), z -> v_mad_u64_u32 / v_mad_i64_i32 where available.
  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  // i64 adds with a constant whose low 32 bits are zero only need a 32-bit
  // add on the high half.
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // Attempt to match a chain of byte-wise multiply-adds onto v_dot4.
  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;

    // Match the v_dot4 tree, while collecting src nodes.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      // Each mul operand must contribute exactly one meaningful byte.
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
      if (!Src1)
        break;

      // All links of the chain must agree on signedness.
      auto IterIsSigned = checkDot4MulSignedness(
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(*Src0, *Src1, Src0s, Src1s, I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
        auto Src0 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
        ChainLength = I + 2;
        break;
      }

      // Descend into the add operand to find the next mul in the chain.
      TempNode = TempNode->getOperand(AddIdx);
      Src2s.push_back(TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    }

    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Src0s, ChainLength);
      fixMasks(Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutation).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(SrcBytes, NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto *FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

        auto *SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
                                              SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
                                             MVT::getIntegerVT(32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
                                             MVT::getIntegerVT(32));
      }
    }

    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Src0s, false, true);
      Src1 = resolveSources(DAG, SL, Src1s, false, true);
    }

    assert(IsSigned);
    SDValue Src2 =
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);

    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
                                                  : Intrinsic::amdgcn_udot4,
                                        SL, MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));

    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  unsigned Opc = LHS.getOpcode();

    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};

    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(RHS.getOperand(1)))
      break;
    SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
    return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }
  return SDValue();
}
16701
// Combine PTRADD nodes: turn folds that cannot become immediate addressing
// offsets into plain arithmetic, fold mul-based offsets into mad64_32, and
// reassociate to keep uniform operands together for scalar execution.
SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // The following folds transform PTRADDs into regular arithmetic in cases
  // where the PTRADD wouldn't be folded as an immediate offset into memory
  // instructions anyway. They are target-specific in that other targets might
  // prefer to not lose information about the pointer arithmetic.

  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
  // Adapted from DAGCombiner::visitADDLikeCommutative.
  SDValue V, K;
  if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
    SDNodeFlags ShlFlags = N1->getFlags();
    // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
    // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
    // preserved.
    SDNodeFlags NewShlFlags =
        ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()

            : SDNodeFlags();
    SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
    DCI.AddToWorklist(Inner.getNode());
    return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
  }

  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
  // performAddCombine.
  if (N1.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  // If the 32 low bits of the constant are all zero, there is nothing to fold
  // into an immediate offset, so it's better to eliminate the unnecessary
  // addition for the lower 32 bits than to preserve the PTRADD.
  // Analogous to a fold in performAddCombine.
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
    return SDValue();

  SDValue X = N0;
  SDValue Y = N1.getOperand(0);
  SDValue Z = N1.getOperand(1);
  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

  if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
      Y->isDivergent() != Z->isDivergent()) {
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
    // y are uniform and z isn't.
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
    // z are uniform and y isn't.
    // The goal is to push uniform operands up in the computation, so that they
    // can be handled with scalar operations. We can't use reassociateScalarOps
    // for this since it requires two identical commutative operations to
    // reassociate.
    if (Y->isDivergent())
      std::swap(Y, Z);
    // If both additions in the original were NUW, reassociation preserves that.
    SDNodeFlags ReassocFlags =
        (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
    SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
    DCI.AddToWorklist(UniformInner.getNode());
    return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
  }

  return SDValue();
}
16781
// Combine SUB nodes: shrink i64 subs whose constant has 32 zero low bits,
// and fold setcc extensions / usubo_carry patterns into carry ops.
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  // i64 subs with a constant whose low 32 bits are zero only need work on
  // the high half.
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // sub x, zext (setcc) => usubo_carry x, 0, setcc
  // sub x, sext (setcc) => uaddo_carry x, 0, setcc
  unsigned Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};

    return DAG.getNode(Opc, SL, VTList, Args);
  }
  }

  if (LHS.getOpcode() == ISD::USUBO_CARRY) {
    // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
    if (!isNullConstant(LHS.getOperand(1)))
      return SDValue();
    SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
    return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
  }
  return SDValue();
}
16829
16830SDValue
16831SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16832 DAGCombinerInfo &DCI) const {
16833
16834 if (N->getValueType(0) != MVT::i32)
16835 return SDValue();
16836
16837 if (!isNullConstant(N->getOperand(1)))
16838 return SDValue();
16839
16840 SelectionDAG &DAG = DCI.DAG;
16841 SDValue LHS = N->getOperand(0);
16842
16843 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16844 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16845 unsigned LHSOpc = LHS.getOpcode();
16846 unsigned Opc = N->getOpcode();
16847 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16848 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16849 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16850 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16851 }
16852 return SDValue();
16853}
16854
16855SDValue SITargetLowering::performFAddCombine(SDNode *N,
16856 DAGCombinerInfo &DCI) const {
16857 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16858 return SDValue();
16859
16860 SelectionDAG &DAG = DCI.DAG;
16861 EVT VT = N->getValueType(0);
16862
16863 SDLoc SL(N);
16864 SDValue LHS = N->getOperand(0);
16865 SDValue RHS = N->getOperand(1);
16866
16867 // These should really be instruction patterns, but writing patterns with
16868 // source modifiers is a pain.
16869
16870 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16871 if (LHS.getOpcode() == ISD::FADD) {
16872 SDValue A = LHS.getOperand(0);
16873 if (A == LHS.getOperand(1)) {
16874 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16875 if (FusedOp != 0) {
16876 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16877 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16878 }
16879 }
16880 }
16881
16882 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16883 if (RHS.getOpcode() == ISD::FADD) {
16884 SDValue A = RHS.getOperand(0);
16885 if (A == RHS.getOperand(1)) {
16886 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16887 if (FusedOp != 0) {
16888 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16889 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16890 }
16891 }
16892 }
16893
16894 return SDValue();
16895}
16896
16897SDValue SITargetLowering::performFSubCombine(SDNode *N,
16898 DAGCombinerInfo &DCI) const {
16899 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16900 return SDValue();
16901
16902 SelectionDAG &DAG = DCI.DAG;
16903 SDLoc SL(N);
16904 EVT VT = N->getValueType(0);
16905 assert(!VT.isVector());
16906
16907 // Try to get the fneg to fold into the source modifier. This undoes generic
16908 // DAG combines and folds them into the mad.
16909 //
16910 // Only do this if we are not trying to support denormals. v_mad_f32 does
16911 // not support denormals ever.
16912 SDValue LHS = N->getOperand(0);
16913 SDValue RHS = N->getOperand(1);
16914 if (LHS.getOpcode() == ISD::FADD) {
16915 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16916 SDValue A = LHS.getOperand(0);
16917 if (A == LHS.getOperand(1)) {
16918 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16919 if (FusedOp != 0) {
16920 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16921 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16922
16923 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16924 }
16925 }
16926 }
16927
16928 if (RHS.getOpcode() == ISD::FADD) {
16929 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16930
16931 SDValue A = RHS.getOperand(0);
16932 if (A == RHS.getOperand(1)) {
16933 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16934 if (FusedOp != 0) {
16935 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16936 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16937 }
16938 }
16939 }
16940
16941 return SDValue();
16942}
16943
16944SDValue SITargetLowering::performFDivCombine(SDNode *N,
16945 DAGCombinerInfo &DCI) const {
16946 SelectionDAG &DAG = DCI.DAG;
16947 SDLoc SL(N);
16948 EVT VT = N->getValueType(0);
16949
16950 // fsqrt legality correlates to rsq availability.
16951 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16952 return SDValue();
16953
16954 SDValue LHS = N->getOperand(0);
16955 SDValue RHS = N->getOperand(1);
16956
16957 SDNodeFlags Flags = N->getFlags();
16958 SDNodeFlags RHSFlags = RHS->getFlags();
16959 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16960 !RHS->hasOneUse())
16961 return SDValue();
16962
16963 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16964 bool IsNegative = false;
16965 if (CLHS->isExactlyValue(1.0) ||
16966 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16967 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16968 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16969 if (RHS.getOpcode() == ISD::FSQRT) {
16970 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16971 SDValue Rsq =
16972 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16973 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16974 }
16975 }
16976 }
16977
16978 return SDValue();
16979}
16980
16981SDValue SITargetLowering::performFMulCombine(SDNode *N,
16982 DAGCombinerInfo &DCI) const {
16983 SelectionDAG &DAG = DCI.DAG;
16984 EVT VT = N->getValueType(0);
16985 EVT ScalarVT = VT.getScalarType();
16986 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
16987
16988 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16989 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16990 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16991 return SDValue();
16992 }
16993
16994 SDValue LHS = N->getOperand(0);
16995 SDValue RHS = N->getOperand(1);
16996
16997 // It is cheaper to realize i32 inline constants as compared against
16998 // materializing f16 or f64 (or even non-inline f32) values,
16999 // possible via ldexp usage, as shown below :
17000 //
17001 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17002 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17003 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
17004 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17005 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17006 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
17007 if (!TrueNode)
17008 return SDValue();
17009 const ConstantFPSDNode *FalseNode =
17010 isConstOrConstSplatFP(RHS.getOperand(2));
17011 if (!FalseNode)
17012 return SDValue();
17013
17014 if (TrueNode->isNegative() != FalseNode->isNegative())
17015 return SDValue();
17016
17017 // For f32, only non-inline constants should be transformed.
17018 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17019 if (ScalarVT == MVT::f32 &&
17020 TII->isInlineConstant(TrueNode->getValueAPF()) &&
17021 TII->isInlineConstant(FalseNode->getValueAPF()))
17022 return SDValue();
17023
17024 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17025 if (TrueNodeExpVal == INT_MIN)
17026 return SDValue();
17027 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17028 if (FalseNodeExpVal == INT_MIN)
17029 return SDValue();
17030
17031 SDLoc SL(N);
17032 SDValue SelectNode =
17033 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17034 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17035 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17036
17037 LHS = TrueNode->isNegative()
17038 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17039 : LHS;
17040
17041 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17042 }
17043
17044 return SDValue();
17045}
17046
// Combine a pair of nested f32 FMAs over fp-extended v2f16 vector elements
// into a single FDOT2 node.
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    // Both multiplicands must be element extracts.
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

      return SDValue();

    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    // The inner FMA's multiplicands must also be element extracts.
    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||

      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    // The two FMAs must read both lanes of the same vector pair.
    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}
17119
/// DAG combine for ISD::SETCC.
///
/// Performs several target folds visible below: compares of an i1
/// sign-extend or constant select against 0/-1, narrowing a 64-bit compare
/// to its high 32 bits when the low halves are known, reusing the carry-out
/// of a 64-bit add/sub, and turning (fcmp (fabs x), +inf) into
/// AMDGPUISD::FP_CLASS.
17120SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17121 DAGCombinerInfo &DCI) const {
17122 SelectionDAG &DAG = DCI.DAG;
17123 SDLoc SL(N);
17124
17125 SDValue LHS = N->getOperand(0);
17126 SDValue RHS = N->getOperand(1);
17127 EVT VT = LHS.getValueType();
17128 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17129
 // Canonicalize a constant operand onto the RHS (swapping the condition
 // code accordingly) so the folds below only need to look at RHS.
17130 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
17131 if (!CRHS) {
 // NOTE(review): a line appears to have been lost in extraction here
 // (original line 17132, presumably re-assigning CRHS from LHS) —
 // verify against the upstream source.
17133 if (CRHS) {
17134 std::swap(LHS, RHS);
17135 CC = getSetCCSwappedOperands(CC);
17136 }
17137 }
17138
17139 if (CRHS) {
17140 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17141 isBoolSGPR(LHS.getOperand(0))) {
17142 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
17143 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
17144 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
17145 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
17146 if ((CRHS->isAllOnes() &&
17147 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
17148 (CRHS->isZero() &&
17149 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
17150 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
17151 DAG.getAllOnesConstant(SL, MVT::i1));
17152 if ((CRHS->isAllOnes() &&
17153 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
17154 (CRHS->isZero() &&
17155 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
17156 return LHS.getOperand(0);
17157 }
17158
17159 const APInt &CRHSVal = CRHS->getAPIntValue();
17160 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
17161 LHS.getOpcode() == ISD::SELECT &&
17162 isa<ConstantSDNode>(LHS.getOperand(1)) &&
17163 isa<ConstantSDNode>(LHS.getOperand(2)) &&
17164 isBoolSGPR(LHS.getOperand(0))) {
17165 // Given CT != FT:
17166 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
17167 // setcc (select cc, CT, CF), CF, ne => cc
17168 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
17169 // setcc (select cc, CT, CF), CT, eq => cc
17170 const APInt &CT = LHS.getConstantOperandAPInt(1);
17171 const APInt &CF = LHS.getConstantOperandAPInt(2);
17172
17173 if (CT != CF) {
17174 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
17175 (CT == CRHSVal && CC == ISD::SETNE))
17176 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17177 if ((CF == CRHSVal && CC == ISD::SETNE) ||
17178 (CT == CRHSVal && CC == ISD::SETEQ))
17179 return LHS.getOperand(0);
17180 }
17181 }
17182 }
17183
17184 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
17185 // following cases where information about the lower 32-bits of its operands
17186 // is known:
17187 //
17188 // If LHS.lo32 == RHS.lo32:
17189 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17190 // If LHS.lo32 != RHS.lo32:
17191 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17192 // If LHS.lo32 >= RHS.lo32 (unsigned):
17193 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17194 // If LHS.lo32 > RHS.lo32 (unsigned):
17195 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17196 // If LHS.lo32 <= RHS.lo32 (unsigned):
17197 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17198 // If LHS.lo32 < RHS.lo32 (unsigned):
17199 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17200 if (VT == MVT::i64) {
17201 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
17202 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
17203
17204 // NewCC is valid iff we can truncate the setcc to only test the upper 32
17205 // bits
 // NOTE(review): the declaration of NewCC (original line 17206, presumably
 // initialized to ISD::SETCC_INVALID given the check below) was lost in
 // extraction — verify against the upstream source.
17207
17208 switch (CC) {
17209 default:
17210 break;
17211 case ISD::SETEQ: {
17212 const std::optional<bool> KnownEq =
17213 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17214 if (KnownEq)
17215 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
17216
17217 break;
17218 }
17219 case ISD::SETNE: {
17220 const std::optional<bool> KnownEq =
17221 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17222 if (KnownEq)
17223 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
17224
17225 break;
17226 }
17227 case ISD::SETULT:
17228 case ISD::SETUGE:
17229 case ISD::SETLT:
17230 case ISD::SETGE: {
17231 const std::optional<bool> KnownUge =
17232 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17233 if (KnownUge) {
17234 if (*KnownUge) {
17235 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17236 NewCC = CC;
17237 } else {
17238 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17239 NewCC = CC == ISD::SETULT ? ISD::SETULE
17240 : CC == ISD::SETUGE ? ISD::SETUGT
17241 : CC == ISD::SETLT ? ISD::SETLE
17242 : ISD::SETGT;
17243 }
17244 }
17245 break;
17246 }
17247 case ISD::SETULE:
17248 case ISD::SETUGT:
17249 case ISD::SETLE:
17250 case ISD::SETGT: {
17251 const std::optional<bool> KnownUle =
17252 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17253 if (KnownUle) {
17254 if (*KnownUle) {
17255 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17256 NewCC = CC;
17257 } else {
17258 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17259 NewCC = CC == ISD::SETULE ? ISD::SETULT
17260 : CC == ISD::SETUGT ? ISD::SETUGE
17261 : CC == ISD::SETLE ? ISD::SETLT
17262 : ISD::SETGE;
17263 }
17264 }
17265 break;
17266 }
17267 }
17268
17269 if (NewCC != ISD::SETCC_INVALID)
17270 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
17271 getHiHalf64(RHS, DAG), NewCC);
17272 }
17273
17274 // Eliminate setcc by using carryout from add/sub instruction
17275
17276 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
17277 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
17278 // similarly for subtraction
17279
17280 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
17281 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
17282
 // NOTE(review): two sd_match condition lines (original lines 17284 and
 // 17286, matching the add/sub forms described in the comments above) were
 // lost in extraction — verify against the upstream source.
17283 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
17285 (CC == ISD::SETUGT &&
17287 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
17288 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
17289 bool IsAdd = LHS.getOpcode() == ISD::ADD;
17290
17291 SDValue Op0 = LHS.getOperand(0);
17292 SDValue Op1 = LHS.getOperand(1);
17293
 // Split both operands into 32-bit halves and re-emit the arithmetic as
 // UADDO/USUBO plus a carry-using high half, so the setcc result becomes
 // the carry-out of the high-half operation.
17294 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
17295 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
17296
17297 SDValue Op0Hi = getHiHalf64(Op0, DAG);
17298 SDValue Op1Hi = getHiHalf64(Op1, DAG);
17299
17300 SDValue NodeLo =
17301 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
17302 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17303
17304 SDValue CarryInHi = NodeLo.getValue(1);
17305 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
17306 SL, DAG.getVTList(MVT::i32, MVT::i1),
17307 {Op0Hi, Op1Hi, CarryInHi});
17308
17309 SDValue ResultLo = NodeLo.getValue(0);
17310 SDValue ResultHi = NodeHi.getValue(0);
17311
17312 SDValue JoinedResult =
17313 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
17314
17315 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
17316 SDValue Overflow = NodeHi.getValue(1);
 // Replace the original 64-bit add/sub with the split form and use the
 // high-half carry-out as the compare result.
17317 DCI.CombineTo(LHS.getNode(), Result);
17318 return Overflow;
17319 }
17320
17321 if (VT != MVT::f32 && VT != MVT::f64 &&
17322 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17323 return SDValue();
17324
17325 // Match isinf/isfinite pattern
17326 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
17327 // (fcmp one (fabs x), inf) -> (fp_class x,
17328 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
17329 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
17330 LHS.getOpcode() == ISD::FABS) {
17331 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
17332 if (!CRHS)
17333 return SDValue();
17334
17335 const APFloat &APF = CRHS->getValueAPF();
17336 if (APF.isInfinity() && !APF.isNegative()) {
 // NOTE(review): the mask initializers (original lines 17338 and
 // 17340-17342, presumably SIInstrFlags class-mask bit ORs matching the
 // comment above) were lost in extraction — verify against upstream.
17337 const unsigned IsInfMask =
17339 const unsigned IsFiniteMask =
17343 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
17344 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
17345 DAG.getConstant(Mask, SL, MVT::i32));
17346 }
17347 }
17348
17349 return SDValue();
17350}
17351
17352SDValue
17353SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
17354 DAGCombinerInfo &DCI) const {
17355 SelectionDAG &DAG = DCI.DAG;
17356 SDLoc SL(N);
17357 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17358
17359 SDValue Src = N->getOperand(0);
17360 SDValue Shift = N->getOperand(0);
17361
17362 // TODO: Extend type shouldn't matter (assuming legal types).
17363 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
17364 Shift = Shift.getOperand(0);
17365
17366 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
17367 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
17368 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
17369 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
17370 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
17371 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
17372 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
17373 SDValue Shifted = DAG.getZExtOrTrunc(
17374 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
17375
17376 unsigned ShiftOffset = 8 * Offset;
17377 if (Shift.getOpcode() == ISD::SHL)
17378 ShiftOffset -= C->getZExtValue();
17379 else
17380 ShiftOffset += C->getZExtValue();
17381
17382 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17383 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17384 MVT::f32, Shifted);
17385 }
17386 }
17387 }
17388
17389 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17390 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
17391 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
17392 // We simplified Src. If this node is not dead, visit it again so it is
17393 // folded properly.
17394 if (N->getOpcode() != ISD::DELETED_NODE)
17395 DCI.AddToWorklist(N);
17396 return SDValue(N, 0);
17397 }
17398
17399 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
17400 if (SDValue DemandedSrc =
17401 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
17402 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
17403
17404 return SDValue();
17405}
17406
17407SDValue SITargetLowering::performClampCombine(SDNode *N,
17408 DAGCombinerInfo &DCI) const {
17409 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
17410 if (!CSrc)
17411 return SDValue();
17412
17413 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17414 const APFloat &F = CSrc->getValueAPF();
17415 APFloat Zero = APFloat::getZero(F.getSemantics());
17416 if (F < Zero ||
17417 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17418 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
17419 }
17420
17421 APFloat One(F.getSemantics(), "1.0");
17422 if (F > One)
17423 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
17424
17425 return SDValue(CSrc, 0);
17426}
17427
17428SDValue SITargetLowering::performSelectCombine(SDNode *N,
17429 DAGCombinerInfo &DCI) const {
17430
17431 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17432 // integer).
17433 // Detect when CMP and SELECT use the same constant and fold them to avoid
17434 // loading the constant twice. Specifically handles patterns like:
17435 // %cmp = icmp eq i32 %val, 4242
17436 // %sel = select i1 %cmp, i32 4242, i32 %other
17437 // It can be optimized to reuse %val instead of 4242 in select.
17438 SDValue Cond = N->getOperand(0);
17439 SDValue TrueVal = N->getOperand(1);
17440 SDValue FalseVal = N->getOperand(2);
17441
17442 // Check if condition is a comparison.
17443 if (Cond.getOpcode() != ISD::SETCC)
17444 return SDValue();
17445
17446 SDValue LHS = Cond.getOperand(0);
17447 SDValue RHS = Cond.getOperand(1);
17448 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17449
17450 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17451 bool isInteger = LHS.getValueType().isInteger();
17452
17453 // Handle simple floating-point and integer types only.
17454 if (!isFloatingPoint && !isInteger)
17455 return SDValue();
17456
17457 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17458 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17459 if (!isEquality && !isNonEquality)
17460 return SDValue();
17461
17462 SDValue ArgVal, ConstVal;
17463 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17464 (isInteger && isa<ConstantSDNode>(RHS))) {
17465 ConstVal = RHS;
17466 ArgVal = LHS;
17467 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17468 (isInteger && isa<ConstantSDNode>(LHS))) {
17469 ConstVal = LHS;
17470 ArgVal = RHS;
17471 } else {
17472 return SDValue();
17473 }
17474
17475 // Skip optimization for inlinable immediates.
17476 if (isFloatingPoint) {
17477 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17478 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17479 return SDValue();
17480 } else {
17482 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17483 return SDValue();
17484 }
17485
17486 // For equality and non-equality comparisons, patterns:
17487 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17488 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17489 if (!(isEquality && TrueVal == ConstVal) &&
17490 !(isNonEquality && FalseVal == ConstVal))
17491 return SDValue();
17492
17493 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17494 SDValue SelectRHS =
17495 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17496 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17497 SelectLHS, SelectRHS);
17498}
17499
// Top-level DAG combine dispatch for the SI lowering: first promotes
// uniform 16-bit ops to i32 (independent of opt level), then routes each
// opcode to its dedicated perform*Combine helper.
// NOTE(review): the opening signature line (original line 17500,
// presumably "SDValue SITargetLowering::PerformDAGCombine(SDNode *N,") was
// lost in extraction — verify against the upstream source.
17501 DAGCombinerInfo &DCI) const {
 // Uniform-op promotion runs even at -O0.
17502 switch (N->getOpcode()) {
17503 case ISD::ADD:
17504 case ISD::SUB:
17505 case ISD::SHL:
17506 case ISD::SRL:
17507 case ISD::SRA:
17508 case ISD::AND:
17509 case ISD::OR:
17510 case ISD::XOR:
17511 case ISD::MUL:
17512 case ISD::SETCC:
17513 case ISD::SELECT:
17514 case ISD::SMIN:
17515 case ISD::SMAX:
17516 case ISD::UMIN:
17517 case ISD::UMAX:
17518 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17519 return Res;
17520 break;
17521 default:
17522 break;
17523 }
17524
 // All remaining combines are optimizations only.
17525 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17526 return SDValue();
17527
17528 switch (N->getOpcode()) {
17529 case ISD::ADD:
17530 return performAddCombine(N, DCI);
17531 case ISD::PTRADD:
17532 return performPtrAddCombine(N, DCI);
17533 case ISD::SUB:
17534 return performSubCombine(N, DCI);
17535 case ISD::UADDO_CARRY:
17536 case ISD::USUBO_CARRY:
17537 return performAddCarrySubCarryCombine(N, DCI);
17538 case ISD::FADD:
17539 return performFAddCombine(N, DCI);
17540 case ISD::FSUB:
17541 return performFSubCombine(N, DCI);
17542 case ISD::FDIV:
17543 return performFDivCombine(N, DCI);
17544 case ISD::FMUL:
17545 return performFMulCombine(N, DCI);
17546 case ISD::SETCC:
17547 return performSetCCCombine(N, DCI);
17548 case ISD::SELECT:
17549 if (auto Res = performSelectCombine(N, DCI))
17550 return Res;
17551 break;
17552 case ISD::FMAXNUM:
17553 case ISD::FMINNUM:
17554 case ISD::FMAXNUM_IEEE:
17555 case ISD::FMINNUM_IEEE:
17556 case ISD::FMAXIMUM:
17557 case ISD::FMINIMUM:
17558 case ISD::FMAXIMUMNUM:
17559 case ISD::FMINIMUMNUM:
17560 case ISD::SMAX:
17561 case ISD::SMIN:
17562 case ISD::UMAX:
17563 case ISD::UMIN:
17564 case AMDGPUISD::FMIN_LEGACY:
17565 case AMDGPUISD::FMAX_LEGACY:
17566 return performMinMaxCombine(N, DCI);
17567 case ISD::FMA:
17568 return performFMACombine(N, DCI);
17569 case ISD::AND:
17570 return performAndCombine(N, DCI);
17571 case ISD::OR:
17572 return performOrCombine(N, DCI);
17573 case ISD::FSHR: {
 // NOTE(review): a line declaring TII (original line 17574) was lost in
 // extraction — verify against the upstream source.
17575 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17576 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17577 return matchPERM(N, DCI);
17578 }
17579 break;
17580 }
17581 case ISD::XOR:
17582 return performXorCombine(N, DCI);
17583 case ISD::ANY_EXTEND:
17584 case ISD::ZERO_EXTEND:
17585 return performZeroOrAnyExtendCombine(N, DCI);
 // NOTE(review): the case label preceding this return (original line
 // 17586, presumably ISD::SIGN_EXTEND_INREG) was lost in extraction.
17587 return performSignExtendInRegCombine(N, DCI);
17588 case AMDGPUISD::FP_CLASS:
17589 return performClassCombine(N, DCI);
17590 case ISD::FCANONICALIZE:
17591 return performFCanonicalizeCombine(N, DCI);
17592 case AMDGPUISD::RCP:
17593 return performRcpCombine(N, DCI);
17594 case ISD::FLDEXP:
17595 case AMDGPUISD::FRACT:
17596 case AMDGPUISD::RSQ:
17597 case AMDGPUISD::RCP_LEGACY:
17598 case AMDGPUISD::RCP_IFLAG:
17599 case AMDGPUISD::RSQ_CLAMP: {
17600 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17601 SDValue Src = N->getOperand(0);
17602 if (Src.isUndef())
17603 return Src;
17604 break;
17605 }
17606 case ISD::SINT_TO_FP:
17607 case ISD::UINT_TO_FP:
17608 return performUCharToFloatCombine(N, DCI);
17609 case ISD::FCOPYSIGN:
17610 return performFCopySignCombine(N, DCI);
17611 case AMDGPUISD::CVT_F32_UBYTE0:
17612 case AMDGPUISD::CVT_F32_UBYTE1:
17613 case AMDGPUISD::CVT_F32_UBYTE2:
17614 case AMDGPUISD::CVT_F32_UBYTE3:
17615 return performCvtF32UByteNCombine(N, DCI);
17616 case AMDGPUISD::FMED3:
17617 return performFMed3Combine(N, DCI);
17618 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17619 return performCvtPkRTZCombine(N, DCI);
17620 case AMDGPUISD::CLAMP:
17621 return performClampCombine(N, DCI);
17622 case ISD::SCALAR_TO_VECTOR: {
17623 SelectionDAG &DAG = DCI.DAG;
17624 EVT VT = N->getValueType(0);
17625
17626 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17627 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17628 SDLoc SL(N);
17629 SDValue Src = N->getOperand(0);
17630 EVT EltVT = Src.getValueType();
17631 if (EltVT != MVT::i16)
17632 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17633
17634 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17635 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17636 }
17637
17638 break;
17639 }
 // NOTE(review): the case labels preceding the next two returns (original
 // lines 17640 and 17642, presumably ISD::EXTRACT_VECTOR_ELT and
 // ISD::INSERT_VECTOR_ELT) were lost in extraction.
17641 return performExtractVectorEltCombine(N, DCI);
17643 return performInsertVectorEltCombine(N, DCI);
17644 case ISD::FP_ROUND:
17645 return performFPRoundCombine(N, DCI);
17646 case ISD::LOAD: {
17647 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17648 return Widened;
17649 [[fallthrough]];
17650 }
17651 default: {
 // Memory nodes get a generic combine once types are legal.
17652 if (!DCI.isBeforeLegalize()) {
17653 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17654 return performMemSDNodeCombine(MemNode, DCI);
17655 }
17656
17657 break;
17658 }
17659 }
17660
 // NOTE(review): the final return (original line 17661, presumably
 // deferring to AMDGPUTargetLowering::PerformDAGCombine) was lost in
 // extraction — verify against the upstream source.
17662}
17663
17664/// Helper function for adjustWritemask
17665static unsigned SubIdx2Lane(unsigned Idx) {
17666 switch (Idx) {
17667 default:
17668 return ~0u;
17669 case AMDGPU::sub0:
17670 return 0;
17671 case AMDGPU::sub1:
17672 return 1;
17673 case AMDGPU::sub2:
17674 return 2;
17675 case AMDGPU::sub3:
17676 return 3;
17677 case AMDGPU::sub4:
17678 return 4; // Possible with TFE/LWE
17679 }
17680}
17681
17682/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
///
/// Inspects every EXTRACT_SUBREG user of the image result, computes the
/// minimal dmask covering the components actually read, and re-emits the
/// node with that dmask and a correspondingly narrowed result type,
/// rewriting the users' subregister indices. Returns the (possibly
/// unchanged) node, or nullptr when all users were already updated.
17683SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17684 SelectionDAG &DAG) const {
17685 unsigned Opcode = Node->getMachineOpcode();
17686
17687 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17688 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17689 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17690 return Node; // not implemented for D16
17691
 // One potential user per lane (sub0..sub3 plus the TFE/LWE lane).
17692 SDNode *Users[5] = {nullptr};
17693 unsigned Lane = 0;
17694 unsigned DmaskIdx =
17695 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17696 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17697 unsigned NewDmask = 0;
17698 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17699 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17700 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17701 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17702 unsigned TFCLane = 0;
17703 bool HasChain = Node->getNumValues() > 1;
17704
17705 if (OldDmask == 0) {
17706 // These are folded out, but on the chance it happens don't assert.
17707 return Node;
17708 }
17709
17710 unsigned OldBitsSet = llvm::popcount(OldDmask);
17711 // Work out which is the TFE/LWE lane if that is enabled.
17712 if (UsesTFC) {
17713 TFCLane = OldBitsSet;
17714 }
17715
17716 // Try to figure out the used register components
17717 for (SDUse &Use : Node->uses()) {
17718
17719 // Don't look at users of the chain.
17720 if (Use.getResNo() != 0)
17721 continue;
17722
17723 SDNode *User = Use.getUser();
17724
17725 // Abort if we can't understand the usage
17726 if (!User->isMachineOpcode() ||
17727 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17728 return Node;
17729
17730 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17731 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17732 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17733 // set, etc.
17734 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17735 if (Lane == ~0u)
17736 return Node;
17737
17738 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17739 if (UsesTFC && Lane == TFCLane) {
17740 Users[Lane] = User;
17741 } else {
17742 // Set which texture component corresponds to the lane.
17743 unsigned Comp;
17744 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17745 Comp = llvm::countr_zero(Dmask);
17746 Dmask &= ~(1 << Comp);
17747 }
17748
17749 // Abort if we have more than one user per component.
17750 if (Users[Lane])
17751 return Node;
17752
17753 Users[Lane] = User;
17754 NewDmask |= 1 << Comp;
17755 }
17756 }
17757
17758 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17759 bool NoChannels = !NewDmask;
17760 if (NoChannels) {
17761 if (!UsesTFC) {
17762 // No uses of the result and not using TFC. Then do nothing.
17763 return Node;
17764 }
17765 // If the original dmask has one channel - then nothing to do
17766 if (OldBitsSet == 1)
17767 return Node;
17768 // Use an arbitrary dmask - required for the instruction to work
17769 NewDmask = 1;
17770 }
17771 // Abort if there's no change
17772 if (NewDmask == OldDmask)
17773 return Node;
17774
17775 unsigned BitsSet = llvm::popcount(NewDmask);
17776
17777 // Check for TFE or LWE - increase the number of channels by one to account
17778 // for the extra return value
17779 // This will need adjustment for D16 if this is also included in
17780 // adjustWriteMask (this function) but at present D16 are excluded.
17781 unsigned NewChannels = BitsSet + UsesTFC;
17782
17783 int NewOpcode =
17784 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17785 assert(NewOpcode != -1 &&
17786 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17787 "failed to find equivalent MIMG op");
17788
17789 // Adjust the writemask in the node
 // NOTE(review): the declaration of Ops (original line 17790, presumably a
 // SmallVector<SDValue, ...>) was lost in extraction — verify upstream.
17791 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17792 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17793 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17794
17795 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17796
 // Round 3 channels up to v4, 5 up to v8; other counts map directly.
17797 MVT ResultVT = NewChannels == 1
17798 ? SVT
17799 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17800 : NewChannels == 5 ? 8
17801 : NewChannels);
17802 SDVTList NewVTList =
17803 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17804
17805 MachineSDNode *NewNode =
17806 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17807
17808 if (HasChain) {
17809 // Update chain.
17810 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17811 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17812 }
17813
17814 if (NewChannels == 1) {
 // Scalar result: replace the single extract with a plain COPY.
17815 assert(Node->hasNUsesOfValue(1, 0));
17816 SDNode *Copy =
17817 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17818 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17819 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17820 return nullptr;
17821 }
17822
17823 // Update the users of the node with the new indices
17824 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17825 SDNode *User = Users[i];
17826 if (!User) {
17827 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17828 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17829 if (i || !NoChannels)
17830 continue;
17831 } else {
17832 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17833 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17834 if (NewUser != User) {
17835 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17836 DAG.RemoveDeadNode(User);
17837 }
17838 }
17839
 // Advance to the next packed subregister index.
17840 switch (Idx) {
17841 default:
17842 break;
17843 case AMDGPU::sub0:
17844 Idx = AMDGPU::sub1;
17845 break;
17846 case AMDGPU::sub1:
17847 Idx = AMDGPU::sub2;
17848 break;
17849 case AMDGPU::sub2:
17850 Idx = AMDGPU::sub3;
17851 break;
17852 case AMDGPU::sub3:
17853 Idx = AMDGPU::sub4;
17854 break;
17855 }
17856 }
17857
17858 DAG.RemoveDeadNode(Node);
17859 return nullptr;
17860}
17861
/// Returns true if Op, looking through an AssertZext wrapper, is a frame
/// index node.
// NOTE(review): the opening line of this helper (original line 17862,
// presumably "static bool isFrameIndexOp(SDValue Op) {") was lost in
// extraction — verify against the upstream source.
17863 if (Op.getOpcode() == ISD::AssertZext)
17864 Op = Op.getOperand(0);
17865
17866 return isa<FrameIndexSDNode>(Op);
17867}
17868
17869/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17870/// with frame index operands.
17871/// LLVM assumes that inputs are to these instructions are registers.
///
/// Also rewrites CopyToReg of an i1 value into a physical register to go
/// through a VReg_1 virtual register first. Frame-index operands are
/// materialized with S_MOV_B32. Returns the updated node.
17872SDNode *
// NOTE(review): the continuation of this signature (original line 17873,
// presumably "SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,")
// was lost in extraction — verify against the upstream source.
17874 SelectionDAG &DAG) const {
17875 if (Node->getOpcode() == ISD::CopyToReg) {
17876 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17877 SDValue SrcVal = Node->getOperand(2);
17878
17879 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17880 // to try understanding copies to physical registers.
17881 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17882 SDLoc SL(Node);
 // NOTE(review): the declaration of MRI (original line 17883) was lost in
 // extraction — verify against the upstream source.
17884 SDValue VReg = DAG.getRegister(
17885 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17886
 // Chain through the new VReg copy, preserving any glue input.
17887 SDNode *Glued = Node->getGluedNode();
17888 SDValue ToVReg = DAG.getCopyToReg(
17889 Node->getOperand(0), SL, VReg, SrcVal,
17890 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17891 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17892 VReg, ToVReg.getValue(1));
17893 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17894 DAG.RemoveDeadNode(Node);
17895 return ToResultReg.getNode();
17896 }
17897 }
 // NOTE(review): the declaration of Ops (original line 17899) appears to
 // have lost its text in extraction — verify against the upstream source.
17899
17900 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17901 if (!isFrameIndexOp(Node->getOperand(i))) {
17902 Ops.push_back(Node->getOperand(i));
17903 continue;
17904 }
17905
 // Replace the frame index with an S_MOV_B32 so the operand is a register.
17906 SDLoc DL(Node);
17907 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17908 Node->getOperand(i).getValueType(),
17909 Node->getOperand(i)),
17910 0));
17911 }
17912
17913 return DAG.UpdateNodeOperands(Node, Ops);
17914}
17915
17916/// Fold the instructions after selecting them.
17917/// Returns null if users were already updated.
// NOTE(review): the signature line (original line 17918) and the TII
// declaration (original line 17920) were lost in extraction — verify
// against the upstream source.
17919 SelectionDAG &DAG) const {
17921 unsigned Opcode = Node->getMachineOpcode();
17922
 // Image loads with a dmask can have their writemask shrunk to the
 // components actually used.
17923 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17924 !TII->isGather4(Opcode) &&
17925 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17926 return adjustWritemask(Node, DAG);
17927 }
17928
17929 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
 // NOTE(review): the call on the dropped original line 17930 (presumably
 // legalizeTargetIndependentNode) was lost in extraction — verify upstream.
17931 return Node;
17932 }
17933
17934 switch (Opcode) {
17935 case AMDGPU::V_DIV_SCALE_F32_e64:
17936 case AMDGPU::V_DIV_SCALE_F64_e64: {
17937 // Satisfy the operand register constraint when one of the inputs is
17938 // undefined. Ordinarily each undef value will have its own implicit_def of
17939 // a vreg, so force these to use a single register.
17940 SDValue Src0 = Node->getOperand(1);
17941 SDValue Src1 = Node->getOperand(3);
17942 SDValue Src2 = Node->getOperand(5);
17943
 // Constraint already satisfied by a defined src0 equal to src1 or src2.
17944 if ((Src0.isMachineOpcode() &&
17945 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17946 (Src0 == Src1 || Src0 == Src2))
17947 break;
17948
17949 MVT VT = Src0.getValueType().getSimpleVT();
17950 const TargetRegisterClass *RC =
17951 getRegClassFor(VT, Src0.getNode()->isDivergent());
17952
 // NOTE(review): the declaration of MRI (original line 17953) was lost in
 // extraction — verify against the upstream source.
17954 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17955
17956 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17957 Src0, SDValue());
17958
17959 // src0 must be the same register as src1 or src2, even if the value is
17960 // undefined, so make sure we don't violate this constraint.
17961 if (Src0.isMachineOpcode() &&
17962 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17963 if (Src1.isMachineOpcode() &&
17964 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17965 Src0 = Src1;
17966 else if (Src2.isMachineOpcode() &&
17967 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17968 Src0 = Src2;
17969 else {
17970 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17971 Src0 = UndefReg;
17972 Src1 = UndefReg;
17973 }
17974 } else
17975 break;
17976
 // NOTE(review): the declaration of Ops (original line 17977, presumably
 // copying Node->ops()) was lost in extraction — verify upstream.
17978 Ops[1] = Src0;
17979 Ops[3] = Src1;
17980 Ops[5] = Src2;
17981 Ops.push_back(ImpDef.getValue(1));
17982 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17983 }
17984 default:
17985 break;
17986 }
17987
17988 return Node;
17989}
17990
17991// Any MIMG instructions that use tfe or lwe require an initialization of the
17992// result register that will be written in the case of a memory access failure.
17993// The required code is also added to tie this init code to the result of the
17994// img instruction.
// NOTE(review): the function signature and TII declaration (original lines
// 17995-17996) were lost in extraction — verify against the upstream source.
17997 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17998 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17999 MachineBasicBlock &MBB = *MI.getParent();
18000
18001 int DstIdx =
18002 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18003 unsigned InitIdx = 0;
18004
18005 if (TII->isImage(MI)) {
18006 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
18007 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
18008 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
18009
18010 if (!TFE && !LWE) // intersect_ray
18011 return;
18012
18013 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18014 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18015 unsigned D16Val = D16 ? D16->getImm() : 0;
18016
18017 if (!TFEVal && !LWEVal)
18018 return;
18019
18020 // At least one of TFE or LWE are non-zero
18021 // We have to insert a suitable initialization of the result value and
18022 // tie this to the dest of the image instruction.
18023
18024 // Calculate which dword we have to initialize to 0.
18025 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
18026
18027 // check that dmask operand is found.
18028 assert(MO_Dmask && "Expected dmask operand in instruction");
18029
18030 unsigned dmask = MO_Dmask->getImm();
18031 // Determine the number of active lanes taking into account the
18032 // Gather4 special case
18033 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
18034
18035 bool Packed = !Subtarget->hasUnpackedD16VMem();
18036
 // Packed D16 results use half as many dwords; +1 covers the TFE/LWE dword.
18037 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18038
18039 // Abandon attempt if the dst size isn't large enough
18040 // - this is in fact an error but this is picked up elsewhere and
18041 // reported correctly.
18042 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18043
18044 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18045 if (DstSize < InitIdx)
18046 return;
18047 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
18048 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18049 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18050 } else {
18051 return;
18052 }
18053
18054 const DebugLoc &DL = MI.getDebugLoc();
18055
18056 // Create a register for the initialization value.
18057 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
18058 unsigned NewDst = 0; // Final initialized value will be in here
18059
18060 // If PRTStrictNull feature is enabled (the default) then initialize
18061 // all the result registers to 0, otherwise just the error indication
18062 // register (VGPRn+1)
18063 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18064 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18065
18066 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
18067 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18068 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
18069 // Initialize dword
18070 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
18071 // clang-format off
18072 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
18073 .addImm(0);
18074 // clang-format on
18075 // Insert into the super-reg
18076 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
18077 .addReg(PrevDst)
18078 .addReg(SubReg)
 // NOTE(review): the subregister-index operand line (original line 18079)
 // was lost in extraction — verify against the upstream source.
18080
18081 PrevDst = NewDst;
18082 }
18083
18084 // Add as an implicit operand
18085 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
18086
18087 // Tie the just added implicit operand to the dst
18088 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18089}
18090
18091/// Assign the register class depending on the number of
18092/// bits set in the writemask
///
/// Post-isel hook: legalizes VOP3 constant-bus usage (including the
/// appended MFMA scale operands) and enforces operand register-class
/// alignment for image instructions.
// NOTE(review): the function signature and the TII declaration (original
// lines 18093 and 18095) were lost in extraction — verify upstream.
18094 SDNode *Node) const {
18096
18097 MachineFunction *MF = MI.getMF();
 // NOTE(review): the MRI declaration (original line 18098) was lost in
 // extraction — verify against the upstream source.
18099
18100 if (TII->isVOP3(MI.getOpcode())) {
18101 // Make sure constant bus requirements are respected.
18102 TII->legalizeOperandsVOP3(MRI, MI);
18103
18104 if (TII->isMAI(MI)) {
18105 // The ordinary src0, src1, src2 were legalized above.
18106 //
18107 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18108 // as a separate instruction.
18109 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18110 AMDGPU::OpName::scale_src0);
18111 if (Src0Idx != -1) {
18112 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18113 AMDGPU::OpName::scale_src1);
 // Both scale operands on the constant bus would violate the limit;
 // move one into a register.
18114 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18115 TII->usesConstantBus(MRI, MI, Src1Idx))
18116 TII->legalizeOpWithMove(MI, Src1Idx);
18117 }
18118 }
18119
18120 return;
18121 }
18122
18123 if (TII->isImage(MI))
18124 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
18125}
18126
18128 uint64_t Val) {
18129 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
18130 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
18131}
18132
18134 const SDLoc &DL,
18135 SDValue Ptr) const {
18137
18138 // Build the half of the subregister with the constants before building the
18139 // full 128-bit register. If we are building multiple resource descriptors,
18140 // this will allow CSEing of the 2-component register.
18141 const SDValue Ops0[] = {
18142 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
18143 buildSMovImm32(DAG, DL, 0),
18144 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
18145 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
18146 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
18147
18148 SDValue SubRegHi = SDValue(
18149 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
18150
18151 // Combine the constants and the pointer.
18152 const SDValue Ops1[] = {
18153 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
18154 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
18155 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
18156
18157 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
18158}
18159
18160/// Return a resource descriptor with the 'Add TID' bit enabled
18161/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
18162/// of the resource descriptor) to create an offset, which is added to
18163/// the resource pointer.
18165 SDValue Ptr, uint32_t RsrcDword1,
18166 uint64_t RsrcDword2And3) const {
18167 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
18168 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
18169 if (RsrcDword1) {
18170 PtrHi =
18171 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
18172 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
18173 0);
18174 }
18175
18176 SDValue DataLo =
18177 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
18178 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
18179
18180 const SDValue Ops[] = {
18181 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
18182 PtrLo,
18183 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
18184 PtrHi,
18185 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
18186 DataLo,
18187 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
18188 DataHi,
18189 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
18190
18191 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
18192}
18193
18194//===----------------------------------------------------------------------===//
18195// SI Inline Assembly Support
18196//===----------------------------------------------------------------------===//
18197
18198std::pair<unsigned, const TargetRegisterClass *>
18200 StringRef Constraint,
18201 MVT VT) const {
18202 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
18203
18204 const TargetRegisterClass *RC = nullptr;
18205 if (Constraint.size() == 1) {
18206 // Check if we cannot determine the bit size of the given value type. This
18207 // can happen, for example, in this situation where we have an empty struct
18208 // (size 0): `call void asm "", "v"({} poison)`-
18209 if (VT == MVT::Other)
18210 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18211 const unsigned BitWidth = VT.getSizeInBits();
18212 switch (Constraint[0]) {
18213 default:
18214 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18215 case 's':
18216 case 'r':
18217 switch (BitWidth) {
18218 case 16:
18219 RC = &AMDGPU::SReg_32RegClass;
18220 break;
18221 case 64:
18222 RC = &AMDGPU::SGPR_64RegClass;
18223 break;
18224 default:
18226 if (!RC)
18227 return std::pair(0U, nullptr);
18228 break;
18229 }
18230 break;
18231 case 'v':
18232 switch (BitWidth) {
18233 case 1:
18234 return std::pair(0U, nullptr);
18235 case 16:
18236 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18237 : &AMDGPU::VGPR_32_Lo256RegClass;
18238 break;
18239 default:
18240 RC = Subtarget->has1024AddressableVGPRs()
18241 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18242 : TRI->getVGPRClassForBitWidth(BitWidth);
18243 if (!RC)
18244 return std::pair(0U, nullptr);
18245 break;
18246 }
18247 break;
18248 case 'a':
18249 if (!Subtarget->hasMAIInsts())
18250 break;
18251 switch (BitWidth) {
18252 case 1:
18253 return std::pair(0U, nullptr);
18254 case 16:
18255 RC = &AMDGPU::AGPR_32RegClass;
18256 break;
18257 default:
18258 RC = TRI->getAGPRClassForBitWidth(BitWidth);
18259 if (!RC)
18260 return std::pair(0U, nullptr);
18261 break;
18262 }
18263 break;
18264 }
18265 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18266 const unsigned BitWidth = VT.getSizeInBits();
18267 switch (BitWidth) {
18268 case 16:
18269 RC = &AMDGPU::AV_32RegClass;
18270 break;
18271 default:
18272 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
18273 if (!RC)
18274 return std::pair(0U, nullptr);
18275 break;
18276 }
18277 }
18278
18279 // We actually support i128, i16 and f16 as inline parameters
18280 // even if they are not reported as legal
18281 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
18282 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
18283 return std::pair(0U, RC);
18284
18285 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
18286 if (Kind != '\0') {
18287 if (Kind == 'v') {
18288 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18289 } else if (Kind == 's') {
18290 RC = &AMDGPU::SGPR_32RegClass;
18291 } else if (Kind == 'a') {
18292 RC = &AMDGPU::AGPR_32RegClass;
18293 }
18294
18295 if (RC) {
18296 if (NumRegs > 1) {
18297 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
18298 return std::pair(0U, nullptr);
18299
18300 uint32_t Width = NumRegs * 32;
18301 // Prohibit constraints for register ranges with a width that does not
18302 // match the required type.
18303 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
18304 return std::pair(0U, nullptr);
18305
18306 MCRegister Reg = RC->getRegister(Idx);
18308 RC = TRI->getVGPRClassForBitWidth(Width);
18309 else if (SIRegisterInfo::isSGPRClass(RC))
18310 RC = TRI->getSGPRClassForBitWidth(Width);
18311 else if (SIRegisterInfo::isAGPRClass(RC))
18312 RC = TRI->getAGPRClassForBitWidth(Width);
18313 if (RC) {
18314 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18315 if (!Reg) {
18316 // The register class does not contain the requested register,
18317 // e.g., because it is an SGPR pair that would violate alignment
18318 // requirements.
18319 return std::pair(0U, nullptr);
18320 }
18321 return std::pair(Reg, RC);
18322 }
18323 }
18324
18325 // Check for lossy scalar/vector conversions.
18326 if (VT.isVector() && VT.getSizeInBits() != 32)
18327 return std::pair(0U, nullptr);
18328 if (Idx < RC->getNumRegs())
18329 return std::pair(RC->getRegister(Idx), RC);
18330 return std::pair(0U, nullptr);
18331 }
18332 }
18333
18334 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18335 if (Ret.first)
18336 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
18337
18338 return Ret;
18339}
18340
18341static bool isImmConstraint(StringRef Constraint) {
18342 if (Constraint.size() == 1) {
18343 switch (Constraint[0]) {
18344 default:
18345 break;
18346 case 'I':
18347 case 'J':
18348 case 'A':
18349 case 'B':
18350 case 'C':
18351 return true;
18352 }
18353 } else if (Constraint == "DA" || Constraint == "DB") {
18354 return true;
18355 }
18356 return false;
18357}
18358
18361 if (Constraint.size() == 1) {
18362 switch (Constraint[0]) {
18363 default:
18364 break;
18365 case 's':
18366 case 'v':
18367 case 'a':
18368 return C_RegisterClass;
18369 }
18370 } else if (Constraint.size() == 2) {
18371 if (Constraint == "VA")
18372 return C_RegisterClass;
18373 }
18374 if (isImmConstraint(Constraint)) {
18375 return C_Other;
18376 }
18377 return TargetLowering::getConstraintType(Constraint);
18378}
18379
18380static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18382 Val = Val & maskTrailingOnes<uint64_t>(Size);
18383 }
18384 return Val;
18385}
18386
18388 StringRef Constraint,
18389 std::vector<SDValue> &Ops,
18390 SelectionDAG &DAG) const {
18391 if (isImmConstraint(Constraint)) {
18392 uint64_t Val;
18393 if (getAsmOperandConstVal(Op, Val) &&
18394 checkAsmConstraintVal(Op, Constraint, Val)) {
18395 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
18396 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
18397 }
18398 } else {
18400 }
18401}
18402
18404 unsigned Size = Op.getScalarValueSizeInBits();
18405 if (Size > 64)
18406 return false;
18407
18408 if (Size == 16 && !Subtarget->has16BitInsts())
18409 return false;
18410
18412 Val = C->getSExtValue();
18413 return true;
18414 }
18416 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18417 return true;
18418 }
18420 if (Size != 16 || Op.getNumOperands() != 2)
18421 return false;
18422 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
18423 return false;
18424 if (ConstantSDNode *C = V->getConstantSplatNode()) {
18425 Val = C->getSExtValue();
18426 return true;
18427 }
18428 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18429 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18430 return true;
18431 }
18432 }
18433
18434 return false;
18435}
18436
18438 uint64_t Val) const {
18439 if (Constraint.size() == 1) {
18440 switch (Constraint[0]) {
18441 case 'I':
18443 case 'J':
18444 return isInt<16>(Val);
18445 case 'A':
18446 return checkAsmConstraintValA(Op, Val);
18447 case 'B':
18448 return isInt<32>(Val);
18449 case 'C':
18450 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
18452 default:
18453 break;
18454 }
18455 } else if (Constraint.size() == 2) {
18456 if (Constraint == "DA") {
18457 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18458 int64_t LoBits = static_cast<int32_t>(Val);
18459 return checkAsmConstraintValA(Op, HiBits, 32) &&
18460 checkAsmConstraintValA(Op, LoBits, 32);
18461 }
18462 if (Constraint == "DB") {
18463 return true;
18464 }
18465 }
18466 llvm_unreachable("Invalid asm constraint");
18467}
18468
18470 unsigned MaxSize) const {
18471 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18472 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18473 if (Size == 16) {
18474 MVT VT = Op.getSimpleValueType();
18475 switch (VT.SimpleTy) {
18476 default:
18477 return false;
18478 case MVT::i16:
18479 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
18480 case MVT::f16:
18481 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
18482 case MVT::bf16:
18483 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
18484 case MVT::v2i16:
18485 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
18486 case MVT::v2f16:
18487 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
18488 case MVT::v2bf16:
18489 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
18490 }
18491 }
18492 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
18493 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
18494 return true;
18495 return false;
18496}
18497
18498static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18499 switch (UnalignedClassID) {
18500 case AMDGPU::VReg_64RegClassID:
18501 return AMDGPU::VReg_64_Align2RegClassID;
18502 case AMDGPU::VReg_96RegClassID:
18503 return AMDGPU::VReg_96_Align2RegClassID;
18504 case AMDGPU::VReg_128RegClassID:
18505 return AMDGPU::VReg_128_Align2RegClassID;
18506 case AMDGPU::VReg_160RegClassID:
18507 return AMDGPU::VReg_160_Align2RegClassID;
18508 case AMDGPU::VReg_192RegClassID:
18509 return AMDGPU::VReg_192_Align2RegClassID;
18510 case AMDGPU::VReg_224RegClassID:
18511 return AMDGPU::VReg_224_Align2RegClassID;
18512 case AMDGPU::VReg_256RegClassID:
18513 return AMDGPU::VReg_256_Align2RegClassID;
18514 case AMDGPU::VReg_288RegClassID:
18515 return AMDGPU::VReg_288_Align2RegClassID;
18516 case AMDGPU::VReg_320RegClassID:
18517 return AMDGPU::VReg_320_Align2RegClassID;
18518 case AMDGPU::VReg_352RegClassID:
18519 return AMDGPU::VReg_352_Align2RegClassID;
18520 case AMDGPU::VReg_384RegClassID:
18521 return AMDGPU::VReg_384_Align2RegClassID;
18522 case AMDGPU::VReg_512RegClassID:
18523 return AMDGPU::VReg_512_Align2RegClassID;
18524 case AMDGPU::VReg_1024RegClassID:
18525 return AMDGPU::VReg_1024_Align2RegClassID;
18526 case AMDGPU::AReg_64RegClassID:
18527 return AMDGPU::AReg_64_Align2RegClassID;
18528 case AMDGPU::AReg_96RegClassID:
18529 return AMDGPU::AReg_96_Align2RegClassID;
18530 case AMDGPU::AReg_128RegClassID:
18531 return AMDGPU::AReg_128_Align2RegClassID;
18532 case AMDGPU::AReg_160RegClassID:
18533 return AMDGPU::AReg_160_Align2RegClassID;
18534 case AMDGPU::AReg_192RegClassID:
18535 return AMDGPU::AReg_192_Align2RegClassID;
18536 case AMDGPU::AReg_256RegClassID:
18537 return AMDGPU::AReg_256_Align2RegClassID;
18538 case AMDGPU::AReg_512RegClassID:
18539 return AMDGPU::AReg_512_Align2RegClassID;
18540 case AMDGPU::AReg_1024RegClassID:
18541 return AMDGPU::AReg_1024_Align2RegClassID;
18542 default:
18543 return -1;
18544 }
18545}
18546
18547// Figure out which registers should be reserved for stack access. Only after
18548// the function is legalized do we know all of the non-spill stack objects or if
18549// calls are present.
18553 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18554 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18555 const SIInstrInfo *TII = ST.getInstrInfo();
18556
18557 if (Info->isEntryFunction()) {
18558 // Callable functions have fixed registers used for stack access.
18560 }
18561
18562 // TODO: Move this logic to getReservedRegs()
18563 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18564 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18565 Register SReg = ST.isWave32()
18566 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18567 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18568 &AMDGPU::SGPR_64RegClass);
18569 Info->setSGPRForEXECCopy(SReg);
18570
18571 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18572 Info->getStackPtrOffsetReg()));
18573 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18574 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18575
18576 // We need to worry about replacing the default register with itself in case
18577 // of MIR testcases missing the MFI.
18578 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18579 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18580
18581 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18582 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18583
18584 Info->limitOccupancy(MF);
18585
18586 if (ST.isWave32() && !MF.empty()) {
18587 for (auto &MBB : MF) {
18588 for (auto &MI : MBB) {
18589 TII->fixImplicitOperands(MI);
18590 }
18591 }
18592 }
18593
18594 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18595 // classes if required. Ideally the register class constraints would differ
18596 // per-subtarget, but there's no easy way to achieve that right now. This is
18597 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18598 // from using them as the register class for legal types.
18599 if (ST.needsAlignedVGPRs()) {
18600 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18601 const Register Reg = Register::index2VirtReg(I);
18602 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18603 if (!RC)
18604 continue;
18605 int NewClassID = getAlignedAGPRClassID(RC->getID());
18606 if (NewClassID != -1)
18607 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18608 }
18609 }
18610
18612}
18613
18615 KnownBits &Known,
18616 const APInt &DemandedElts,
18617 const SelectionDAG &DAG,
18618 unsigned Depth) const {
18619 Known.resetAll();
18620 unsigned Opc = Op.getOpcode();
18621 switch (Opc) {
18623 unsigned IID = Op.getConstantOperandVal(0);
18624 switch (IID) {
18625 case Intrinsic::amdgcn_mbcnt_lo:
18626 case Intrinsic::amdgcn_mbcnt_hi: {
18627 const GCNSubtarget &ST =
18629 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18630 // most 31 + src1.
18631 Known.Zero.setBitsFrom(
18632 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18633 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18634 Known = KnownBits::add(Known, Known2);
18635 return;
18636 }
18637 }
18638 break;
18639 }
18640 }
18642 Op, Known, DemandedElts, DAG, Depth);
18643}
18644
18646 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18648
18649 // Set the high bits to zero based on the maximum allowed scratch size per
18650 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18651 // calculation won't overflow, so assume the sign bit is never set.
18652 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18653}
18654
18656 GISelValueTracking &VT, KnownBits &Known,
18657 unsigned Dim) {
18658 unsigned MaxValue =
18659 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18660 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18661}
18662
18664 KnownBits &Known, const APInt &DemandedElts,
18665 unsigned BFEWidth, bool SExt, unsigned Depth) {
18667 const MachineOperand &Src1 = MI.getOperand(2);
18668
18669 unsigned Src1Cst = 0;
18670 if (Src1.isImm()) {
18671 Src1Cst = Src1.getImm();
18672 } else if (Src1.isReg()) {
18673 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18674 if (!Cst)
18675 return;
18676 Src1Cst = Cst->Value.getZExtValue();
18677 } else {
18678 return;
18679 }
18680
18681 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18682 // Width is always [22:16].
18683 const unsigned Offset =
18684 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18685 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18686
18687 if (Width >= BFEWidth) // Ill-formed.
18688 return;
18689
18690 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18691 Depth + 1);
18692
18693 Known = Known.extractBits(Width, Offset);
18694
18695 if (SExt)
18696 Known = Known.sext(BFEWidth);
18697 else
18698 Known = Known.zext(BFEWidth);
18699}
18700
18702 GISelValueTracking &VT, Register R, KnownBits &Known,
18703 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18704 unsigned Depth) const {
18705 Known.resetAll();
18706 const MachineInstr *MI = MRI.getVRegDef(R);
18707 switch (MI->getOpcode()) {
18708 case AMDGPU::S_BFE_I32:
18709 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18710 /*SExt=*/true, Depth);
18711 case AMDGPU::S_BFE_U32:
18712 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18713 /*SExt=*/false, Depth);
18714 case AMDGPU::S_BFE_I64:
18715 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18716 /*SExt=*/true, Depth);
18717 case AMDGPU::S_BFE_U64:
18718 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18719 /*SExt=*/false, Depth);
18720 case AMDGPU::G_INTRINSIC:
18721 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18722 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18723 switch (IID) {
18724 case Intrinsic::amdgcn_workitem_id_x:
18725 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18726 break;
18727 case Intrinsic::amdgcn_workitem_id_y:
18728 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18729 break;
18730 case Intrinsic::amdgcn_workitem_id_z:
18731 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18732 break;
18733 case Intrinsic::amdgcn_mbcnt_lo:
18734 case Intrinsic::amdgcn_mbcnt_hi: {
18735 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18736 // most 31 + src1.
18737 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18738 ? getSubtarget()->getWavefrontSizeLog2()
18739 : 5);
18740 KnownBits Known2;
18741 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18742 Depth + 1);
18743 Known = KnownBits::add(Known, Known2);
18744 break;
18745 }
18746 case Intrinsic::amdgcn_groupstaticsize: {
18747 // We can report everything over the maximum size as 0. We can't report
18748 // based on the actual size because we don't know if it's accurate or not
18749 // at any given point.
18750 Known.Zero.setHighBits(
18751 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18752 break;
18753 }
18754 }
18755 break;
18756 }
18757 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18758 Known.Zero.setHighBits(24);
18759 break;
18760 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18761 Known.Zero.setHighBits(16);
18762 break;
18763 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
18764 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
18765 // producing exactly 0 or 1.
18766 Known.Zero.setHighBits(Known.getBitWidth() - 1);
18767 break;
18768 case AMDGPU::G_AMDGPU_SMED3:
18769 case AMDGPU::G_AMDGPU_UMED3: {
18770 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18771
18772 KnownBits Known2;
18773 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18774 if (Known2.isUnknown())
18775 break;
18776
18777 KnownBits Known1;
18778 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18779 if (Known1.isUnknown())
18780 break;
18781
18782 KnownBits Known0;
18783 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18784 if (Known0.isUnknown())
18785 break;
18786
18787 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18788 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18789 Known.One = Known0.One & Known1.One & Known2.One;
18790 break;
18791 }
18792 }
18793}
18794
18797 unsigned Depth) const {
18798 const MachineInstr *MI = MRI.getVRegDef(R);
18799 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18800 // FIXME: Can this move to generic code? What about the case where the call
18801 // site specifies a lower alignment?
18802 Intrinsic::ID IID = GI->getIntrinsicID();
18804 AttributeList Attrs =
18805 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18806 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18807 return *RetAlign;
18808 }
18809 return Align(1);
18810}
18811
18814 const Align CacheLineAlign = Align(64);
18815
18816 // GFX950: Prevent an 8-byte instruction at loop header from being split by
18817 // the 32-byte instruction fetch window boundary. This avoids a significant
18818 // fetch delay after backward branch. We use 32-byte alignment with max
18819 // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
18820 if (ML && !DisableLoopAlignment &&
18821 getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
18822 const MachineBasicBlock *Header = ML->getHeader();
18823 // Respect user-specified or previously set alignment.
18824 if (Header->getAlignment() != PrefAlign)
18825 return Header->getAlignment();
18826 if (needsFetchWindowAlignment(*Header))
18827 return Align(32);
18828 }
18829
18830 // Pre-GFX10 target did not benefit from loop alignment
18831 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18832 getSubtarget()->hasInstFwdPrefetchBug())
18833 return PrefAlign;
18834
18835 // On GFX10 I$ is 4 x 64 bytes cache lines.
18836 // By default prefetcher keeps one cache line behind and reads two ahead.
18837 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18838 // behind and one ahead.
18839 // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
18840 // If loop fits 64 bytes it always spans no more than two cache lines and
18841 // does not need an alignment.
18842 // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
18843 // Else if loop is less or equal 192 bytes we need two lines behind.
18844
18846 const MachineBasicBlock *Header = ML->getHeader();
18847 if (Header->getAlignment() != PrefAlign)
18848 return Header->getAlignment(); // Already processed.
18849
18850 unsigned LoopSize = 0;
18851 for (const MachineBasicBlock *MBB : ML->blocks()) {
18852 // If inner loop block is aligned assume in average half of the alignment
18853 // size to be added as nops.
18854 if (MBB != Header)
18855 LoopSize += MBB->getAlignment().value() / 2;
18856
18857 for (const MachineInstr &MI : *MBB) {
18858 LoopSize += TII->getInstSizeInBytes(MI);
18859 if (LoopSize > 192)
18860 return PrefAlign;
18861 }
18862 }
18863
18864 if (LoopSize <= 64)
18865 return PrefAlign;
18866
18867 if (LoopSize <= 128)
18868 return CacheLineAlign;
18869
18870 // If any of parent loops is surrounded by prefetch instructions do not
18871 // insert new for inner loop, which would reset parent's settings.
18872 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18873 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18874 auto I = Exit->getFirstNonDebugInstr();
18875 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18876 return CacheLineAlign;
18877 }
18878 }
18879
18880 MachineBasicBlock *Pre = ML->getLoopPreheader();
18881 MachineBasicBlock *Exit = ML->getExitBlock();
18882
18883 if (Pre && Exit) {
18884 auto PreTerm = Pre->getFirstTerminator();
18885 if (PreTerm == Pre->begin() ||
18886 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18887 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18888 .addImm(1); // prefetch 2 lines behind PC
18889
18890 auto ExitHead = Exit->getFirstNonDebugInstr();
18891 if (ExitHead == Exit->end() ||
18892 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18893 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18894 .addImm(2); // prefetch 1 line behind PC
18895 }
18896
18897 return CacheLineAlign;
18898}
18899
18901 MachineBasicBlock *MBB) const {
18902 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
18903 // instruction could be split by the 32-byte fetch window boundary.
18904 // See getPrefLoopAlignment() for context.
18905 if (needsFetchWindowAlignment(*MBB))
18906 return 4;
18908}
18909
18910bool SITargetLowering::needsFetchWindowAlignment(
18911 const MachineBasicBlock &MBB) const {
18912 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
18913 return false;
18915 for (const MachineInstr &MI : MBB) {
18916 if (MI.isMetaInstruction())
18917 continue;
18918 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
18919 return TII->getInstSizeInBytes(MI) > 4;
18920 }
18921 return false;
18922}
18923
18924[[maybe_unused]]
18925static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18926 assert(N->getOpcode() == ISD::CopyFromReg);
18927 do {
18928 // Follow the chain until we find an INLINEASM node.
18929 N = N->getOperand(0).getNode();
18930 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18931 return true;
18932 } while (N->getOpcode() == ISD::CopyFromReg);
18933 return false;
18934}
18935
18938 UniformityInfo *UA) const {
18939 switch (N->getOpcode()) {
18940 case ISD::CopyFromReg: {
18941 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18942 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18943 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18944 Register Reg = R->getReg();
18945
18946 // FIXME: Why does this need to consider isLiveIn?
18947 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18948 return !TRI->isSGPRReg(MRI, Reg);
18949
18950 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18951 return UA->isDivergent(V);
18952
18954 return !TRI->isSGPRReg(MRI, Reg);
18955 }
18956 case ISD::LOAD: {
18957 const LoadSDNode *L = cast<LoadSDNode>(N);
18958 unsigned AS = L->getAddressSpace();
18959 // A flat load may access private memory.
18961 }
18962 case ISD::CALLSEQ_END:
18963 return true;
18965 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18967 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18968 case AMDGPUISD::ATOMIC_CMP_SWAP:
18969 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18970 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18971 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18972 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18973 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18974 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18975 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18976 case AMDGPUISD::BUFFER_ATOMIC_AND:
18977 case AMDGPUISD::BUFFER_ATOMIC_OR:
18978 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18979 case AMDGPUISD::BUFFER_ATOMIC_INC:
18980 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18981 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18982 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18983 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18984 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18985 // Target-specific read-modify-write atomics are sources of divergence.
18986 return true;
18987 default:
18988 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18989 // Generic read-modify-write atomics are sources of divergence.
18990 return A->readMem() && A->writeMem();
18991 }
18992 return false;
18993 }
18994}
18995
18997 EVT VT) const {
18998 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18999 case MVT::f32:
19001 case MVT::f64:
19002 case MVT::f16:
19004 default:
19005 return false;
19006 }
19007}
19008
19010 LLT Ty, const MachineFunction &MF) const {
19011 switch (Ty.getScalarSizeInBits()) {
19012 case 32:
19013 return !denormalModeIsFlushAllF32(MF);
19014 case 64:
19015 case 16:
19016 return !denormalModeIsFlushAllF64F16(MF);
19017 default:
19018 return false;
19019 }
19020}
19021
19023 const APInt &DemandedElts,
19024 const SelectionDAG &DAG,
19025 bool SNaN,
19026 unsigned Depth) const {
19027 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19028 const MachineFunction &MF = DAG.getMachineFunction();
19030
19031 if (Info->getMode().DX10Clamp)
19032 return true; // Clamped to 0.
19033 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
19034 }
19035
19037 DAG, SNaN, Depth);
19038}
19039
19040// On older subtargets, global FP atomic instructions have a hardcoded FP mode
19041// and do not support FP32 denormals, and only support v2f16/f64 denormals.
19043 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
19044 return true;
19045
19046 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19047 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
19048 if (DenormMode == DenormalMode::getPreserveSign())
19049 return true;
19050
19051 // TODO: Remove this.
19052 return RMW->getFunction()
19053 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
19054 .getValueAsBool();
19055}
19056
19058 LLVMContext &Ctx = RMW->getContext();
19059 StringRef MemScope =
19060 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
19061
19062 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
19063 << "Hardware instruction generated for atomic "
19064 << RMW->getOperationName(RMW->getOperation())
19065 << " operation at memory scope " << MemScope;
19066}
19067
19068static bool isV2F16OrV2BF16(Type *Ty) {
19069 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19070 Type *EltTy = VT->getElementType();
19071 return VT->getNumElements() == 2 &&
19072 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19073 }
19074
19075 return false;
19076}
19077
19078static bool isV2F16(Type *Ty) {
19080 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
19081}
19082
19083static bool isV2BF16(Type *Ty) {
19085 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
19086}
19087
19088/// \return true if atomicrmw integer ops work for the type.
19089static bool isAtomicRMWLegalIntTy(Type *Ty) {
19090 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
19091 unsigned BW = IT->getBitWidth();
19092 return BW == 32 || BW == 64;
19093 }
19094
19095 return false;
19096}
19097
19098/// \return true if this atomicrmw xchg type can be selected.
19099static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19100 Type *Ty = RMW->getType();
19101 if (isAtomicRMWLegalIntTy(Ty))
19102 return true;
19103
19104 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
19105 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19106 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
19107 return BW == 32 || BW == 64;
19108 }
19109
19110 if (Ty->isFloatTy() || Ty->isDoubleTy())
19111 return true;
19112
19114 return VT->getNumElements() == 2 &&
19115 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19116 }
19117
19118 return false;
19119}
19120
19121/// \returns true if it's valid to emit a native instruction for \p RMW, based
19122/// on the properties of the target memory.
19123static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
19124 const AtomicRMWInst *RMW,
19125 bool HasSystemScope) {
19126 // The remote/fine-grained access logic is different from the integer
19127 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
19128 // fine-grained access does not work, even for a device local allocation.
19129 //
19130 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
19131 // allocations work.
19132 if (HasSystemScope) {
19133 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19134 RMW->hasMetadata("amdgpu.no.remote.memory"))
19135 return true;
19136 if (Subtarget.hasEmulatedSystemScopeAtomics())
19137 return true;
19138 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19139 return true;
19140
19141 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
19142}
19143
19144/// \return Action to perform on AtomicRMWInsts for integer operations.
19151
19152/// Return if a flat address space atomicrmw can access private memory.
19154 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
19155 return !MD ||
19157}
19158
19161 // For GAS, lower to flat atomic.
19162 return STI.hasGloballyAddressableScratch()
19165}
19166
19169 unsigned AS = RMW->getPointerAddressSpace();
19170 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
19172
19173 // 64-bit flat atomics that dynamically reside in private memory will silently
19174 // be dropped.
19175 //
19176 // Note that we will emit a new copy of the original atomic in the expansion,
19177 // which will be incrementally relegalized.
19178 const DataLayout &DL = RMW->getFunction()->getDataLayout();
19179 if (AS == AMDGPUAS::FLAT_ADDRESS &&
19180 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
19183
19184 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
19186 ORE.emit([=]() {
19187 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
19188 });
19189 return Kind;
19190 };
19191
19192 auto SSID = RMW->getSyncScopeID();
19193 bool HasSystemScope =
19194 SSID == SyncScope::System ||
19195 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
19196
19197 auto Op = RMW->getOperation();
19198 switch (Op) {
19200 // PCIe supports add and xchg for system atomics.
19201 return isAtomicRMWLegalXChgTy(RMW)
19204 case AtomicRMWInst::Add:
19205 // PCIe supports add and xchg for system atomics.
19207 case AtomicRMWInst::Sub:
19208 case AtomicRMWInst::And:
19209 case AtomicRMWInst::Or:
19210 case AtomicRMWInst::Xor:
19211 case AtomicRMWInst::Max:
19212 case AtomicRMWInst::Min:
19219 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
19221 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
19224 auto *IT = dyn_cast<IntegerType>(RMW->getType());
19225 if (!IT || IT->getBitWidth() != 32)
19227 }
19228
19231 if (Subtarget->hasEmulatedSystemScopeAtomics())
19233
19234 // On most subtargets, for atomicrmw operations other than add/xchg,
19235 // whether or not the instructions will behave correctly depends on where
19236 // the address physically resides and what interconnect is used in the
19237 // system configuration. On some some targets the instruction will nop,
19238 // and in others synchronization will only occur at degraded device scope.
19239 //
19240 // If the allocation is known local to the device, the instructions should
19241 // work correctly.
19242 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
19244
19245 // If fine-grained remote memory works at device scope, we don't need to
19246 // do anything.
19247 if (!HasSystemScope &&
19248 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19250
19251 // If we are targeting a remote allocated address, it depends what kind of
19252 // allocation the address belongs to.
19253 //
19254 // If the allocation is fine-grained (in host memory, or in PCIe peer
19255 // device memory), the operation will fail depending on the target.
19256 //
19257 // Note fine-grained host memory access does work on APUs or if XGMI is
19258 // used, but we do not know if we are targeting an APU or the system
19259 // configuration from the ISA version/target-cpu.
19260 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
19262
19265 // Atomic sub/or/xor do not work over PCI express, but atomic add
19266 // does. InstCombine transforms these with 0 to or, so undo that.
19267 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
19268 ConstVal && ConstVal->isNullValue())
19270 }
19271
19272 // If the allocation could be in remote, fine-grained memory, the rmw
19273 // instructions may fail. cmpxchg should work, so emit that. On some
19274 // system configurations, PCIe atomics aren't supported so cmpxchg won't
19275 // even work, so you're out of luck anyway.
19276
19277 // In summary:
19278 //
19279 // Cases that may fail:
19280 // - fine-grained pinned host memory
19281 // - fine-grained migratable host memory
19282 // - fine-grained PCIe peer device
19283 //
19284 // Cases that should work, but may be treated overly conservatively.
19285 // - fine-grained host memory on an APU
19286 // - fine-grained XGMI peer device
19288 }
19289
19291 }
19292 case AtomicRMWInst::FAdd: {
19293 Type *Ty = RMW->getType();
19294
19295 // TODO: Handle REGION_ADDRESS
19296 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19297 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
19298 // is fixed to round-to-nearest-even.
19299 //
19300 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19301 // round-to-nearest-even.
19302 //
19303 // We ignore the rounding mode problem, even in strictfp. The C++ standard
19304 // suggests it is OK if the floating-point mode may not match the calling
19305 // thread.
19306 if (Ty->isFloatTy()) {
19307 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19309 }
19310
19311 if (Ty->isDoubleTy()) {
19312 // Ignores denormal mode, but we don't consider flushing mandatory.
19313 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19315 }
19316
19317 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19319
19321 }
19322
19323 // LDS atomics respect the denormal mode from the mode register.
19324 //
19325 // Traditionally f32 global/buffer memory atomics would unconditionally
19326 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19327 // flush.
19328 //
19329 // On targets with flat atomic fadd, denormals would flush depending on
19330 // whether the target address resides in LDS or global memory. We consider
19331 // this flat-maybe-flush as will-flush.
19332 if (Ty->isFloatTy() &&
19333 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19336
19337 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19338 // safe. The message phrasing also should be better.
19339 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
19340 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19341 // gfx942, gfx12
19342 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19343 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19344 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19345 // gfx90a, gfx942, gfx12
19346 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19347 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19348
19349 // gfx942, gfx12
19350 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19351 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19352 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19353 // gfx90a, gfx942, gfx12
19354 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19355 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19356
19357 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19358 // buffer. gfx12 does have the buffer version.
19359 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19360 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19361 }
19362
19363 // global and flat atomic fadd f64: gfx90a, gfx942.
19364 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19365 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19366
19367 if (AS != AMDGPUAS::FLAT_ADDRESS) {
19368 if (Ty->isFloatTy()) {
19369 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19370 // gfx11+.
19371 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19372 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19373 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19374 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19375 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19376 } else {
19377 // gfx908
19378 if (RMW->use_empty() &&
19379 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19380 isV2F16(Ty))
19381 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19382 }
19383 }
19384
19385 // flat atomic fadd f32: gfx942, gfx11+.
19386 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19387 if (Subtarget->hasFlatAtomicFaddF32Inst())
19388 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19389
19390 // If it is in flat address space, and the type is float, we will try to
19391 // expand it, if the target supports global and lds atomic fadd. The
19392 // reason we need that is, in the expansion, we emit the check of
19393 // address space. If it is in global address space, we emit the global
19394 // atomic fadd; if it is in shared address space, we emit the LDS atomic
19395 // fadd.
19396 if (Subtarget->hasLDSFPAtomicAddF32()) {
19397 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19399 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19401 }
19402 }
19403 }
19404
19406 }
19408 case AtomicRMWInst::FMax: {
19409 Type *Ty = RMW->getType();
19410
19411 // LDS float and double fmin/fmax were always supported.
19412 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19413 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
19415 }
19416
19417 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
19418 // For flat and global cases:
19419 // float, double in gfx7. Manual claims denormal support.
19420 // Removed in gfx8.
19421 // float, double restored in gfx10.
19422 // double removed again in gfx11, so only f32 for gfx11/gfx12.
19423 //
19424 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19425 // no f32.
19426 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19427 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19428 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19429 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19430 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19431 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
19433 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19434 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19435 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19436 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19437 }
19438 }
19439
19441 }
19444 default:
19446 }
19447
19448 llvm_unreachable("covered atomicrmw op switch");
19449}
19450
19457
19464
19467 const AtomicCmpXchgInst *CmpX) const {
19468 unsigned AddrSpace = CmpX->getPointerAddressSpace();
19469 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19471
19472 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
19474
19475 const DataLayout &DL = CmpX->getDataLayout();
19476
19477 Type *ValTy = CmpX->getNewValOperand()->getType();
19478
19479 // If a 64-bit flat atomic may alias private, we need to avoid using the
19480 // atomic in the private case.
19481 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19483}
19484
19485const TargetRegisterClass *
19486SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19488 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19489 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19490 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19491 : &AMDGPU::SReg_32RegClass;
19492 if (!TRI->isSGPRClass(RC) && !isDivergent)
19493 return TRI->getEquivalentSGPRClass(RC);
19494 if (TRI->isSGPRClass(RC) && isDivergent) {
19495 if (Subtarget->hasGFX90AInsts())
19496 return TRI->getEquivalentAVClass(RC);
19497 return TRI->getEquivalentVGPRClass(RC);
19498 }
19499
19500 return RC;
19501}
19502
19503// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19504// uniform values (as produced by the mask results of control flow intrinsics)
19505// used outside of divergent blocks. The phi users need to also be treated as
19506// always uniform.
19507//
19508// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19509static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19510 unsigned WaveSize) {
19511 // FIXME: We assume we never cast the mask results of a control flow
19512 // intrinsic.
19513 // Early exit if the type won't be consistent as a compile time hack.
19514 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19515 if (!IT || IT->getBitWidth() != WaveSize)
19516 return false;
19517
19518 if (!isa<Instruction>(V))
19519 return false;
19520 if (!Visited.insert(V).second)
19521 return false;
19522 bool Result = false;
19523 for (const auto *U : V->users()) {
19525 if (V == U->getOperand(1)) {
19526 switch (Intrinsic->getIntrinsicID()) {
19527 default:
19528 Result = false;
19529 break;
19530 case Intrinsic::amdgcn_if_break:
19531 case Intrinsic::amdgcn_if:
19532 case Intrinsic::amdgcn_else:
19533 Result = true;
19534 break;
19535 }
19536 }
19537 if (V == U->getOperand(0)) {
19538 switch (Intrinsic->getIntrinsicID()) {
19539 default:
19540 Result = false;
19541 break;
19542 case Intrinsic::amdgcn_end_cf:
19543 case Intrinsic::amdgcn_loop:
19544 Result = true;
19545 break;
19546 }
19547 }
19548 } else {
19549 Result = hasCFUser(U, Visited, WaveSize);
19550 }
19551 if (Result)
19552 break;
19553 }
19554 return Result;
19555}
19556
19558 const Value *V) const {
19559 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
19560 if (CI->isInlineAsm()) {
19561 // FIXME: This cannot give a correct answer. This should only trigger in
19562 // the case where inline asm returns mixed SGPR and VGPR results, used
19563 // outside the defining block. We don't have a specific result to
19564 // consider, so this assumes if any value is SGPR, the overall register
19565 // also needs to be SGPR.
19566 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19568 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19569 for (auto &TC : TargetConstraints) {
19570 if (TC.Type == InlineAsm::isOutput) {
19572 const TargetRegisterClass *RC =
19573 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19574 TC.ConstraintVT)
19575 .second;
19576 if (RC && SIRI->isSGPRClass(RC))
19577 return true;
19578 }
19579 }
19580 }
19581 }
19583 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19584}
19585
19587 for (SDUse &Use : N->uses()) {
19589 if (getBasePtrIndex(M) == Use.getOperandNo())
19590 return true;
19591 }
19592 }
19593 return false;
19594}
19595
19597 SDValue N1) const {
19598 if (!N0.hasOneUse())
19599 return false;
19600 // Take care of the opportunity to keep N0 uniform
19601 if (N0->isDivergent() || !N1->isDivergent())
19602 return true;
19603 // Check if we have a good chance to form the memory access pattern with the
19604 // base and offset
19605 return (DAG.isBaseWithConstantOffset(N0) &&
19607}
19608
19610 Register N0, Register N1) const {
19611 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19612}
19613
19616 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19618 if (I.getMetadata("amdgpu.noclobber"))
19619 Flags |= MONoClobber;
19620 if (I.getMetadata("amdgpu.last.use"))
19621 Flags |= MOLastUse;
19622 return Flags;
19623}
19624
19626 Instruction *AI) const {
19627 // Given: atomicrmw fadd ptr %addr, float %val ordering
19628 //
19629 // With this expansion we produce the following code:
19630 // [...]
19631 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19632 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19633 //
19634 // atomicrmw.shared:
19635 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19636 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19637 // float %val ordering
19638 // br label %atomicrmw.phi
19639 //
19640 // atomicrmw.check.private:
19641 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19642 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19643 //
19644 // atomicrmw.private:
19645 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19646 // %loaded.private = load float, ptr addrspace(5) %cast.private
19647 // %val.new = fadd float %loaded.private, %val
19648 // store float %val.new, ptr addrspace(5) %cast.private
19649 // br label %atomicrmw.phi
19650 //
19651 // atomicrmw.global:
19652 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19653 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19654 // float %val ordering
19655 // br label %atomicrmw.phi
19656 //
19657 // atomicrmw.phi:
19658 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19659 // [ %loaded.private, %atomicrmw.private ],
19660 // [ %loaded.global, %atomicrmw.global ]
19661 // br label %atomicrmw.end
19662 //
19663 // atomicrmw.end:
19664 // [...]
19665 //
19666 //
19667 // For 64-bit atomics which may reside in private memory, we perform a simpler
19668 // version that only inserts the private check, and uses the flat operation.
19669
19670 IRBuilder<> Builder(AI);
19671 LLVMContext &Ctx = Builder.getContext();
19672
19673 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19674 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19676 Value *Addr = AI->getOperand(PtrOpIdx);
19677
19678 /// TODO: Only need to check private, then emit flat-known-not private (no
19679 /// need for shared block, or cast to global).
19681
19682 Align Alignment;
19683 if (RMW)
19684 Alignment = RMW->getAlign();
19685 else if (CX)
19686 Alignment = CX->getAlign();
19687 else
19688 llvm_unreachable("unhandled atomic operation");
19689
19690 // FullFlatEmulation is true if we need to issue the private, shared, and
19691 // global cases.
19692 //
19693 // If this is false, we are only dealing with the flat-targeting-private case,
19694 // where we only insert a check for private and still use the flat instruction
19695 // for global and shared.
19696
19697 bool FullFlatEmulation =
19698 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19699 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19700 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19701 RMW->getType()->isDoubleTy()));
19702
19703 // If the return value isn't used, do not introduce a false use in the phi.
19704 bool ReturnValueIsUsed = !AI->use_empty();
19705
19706 BasicBlock *BB = Builder.GetInsertBlock();
19707 Function *F = BB->getParent();
19708 BasicBlock *ExitBB =
19709 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19710 BasicBlock *SharedBB = nullptr;
19711
19712 BasicBlock *CheckPrivateBB = BB;
19713 if (FullFlatEmulation) {
19714 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19715 CheckPrivateBB =
19716 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19717 }
19718
19719 BasicBlock *PrivateBB =
19720 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19721 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19722 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19723
19724 std::prev(BB->end())->eraseFromParent();
19725 Builder.SetInsertPoint(BB);
19726
19727 Value *LoadedShared = nullptr;
19728 if (FullFlatEmulation) {
19729 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19730 {Addr}, nullptr, "is.shared");
19731 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19732 Builder.SetInsertPoint(SharedBB);
19733 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19735
19736 Instruction *Clone = AI->clone();
19737 Clone->insertInto(SharedBB, SharedBB->end());
19738 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19739 LoadedShared = Clone;
19740
19741 Builder.CreateBr(PhiBB);
19742 Builder.SetInsertPoint(CheckPrivateBB);
19743 }
19744
19745 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19746 {Addr}, nullptr, "is.private");
19747 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19748
19749 Builder.SetInsertPoint(PrivateBB);
19750
19751 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19753
19754 Value *LoadedPrivate;
19755 if (RMW) {
19756 LoadedPrivate = Builder.CreateAlignedLoad(
19757 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19758
19759 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19760 LoadedPrivate, RMW->getValOperand());
19761
19762 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19763 } else {
19764 auto [ResultLoad, Equal] =
19765 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19766 CX->getNewValOperand(), CX->getAlign());
19767
19768 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19769 ResultLoad, 0);
19770 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19771 }
19772
19773 Builder.CreateBr(PhiBB);
19774
19775 Builder.SetInsertPoint(GlobalBB);
19776
19777 // Continue using a flat instruction if we only emitted the check for private.
19778 Instruction *LoadedGlobal = AI;
19779 if (FullFlatEmulation) {
19780 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19782 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19783 }
19784
19785 AI->removeFromParent();
19786 AI->insertInto(GlobalBB, GlobalBB->end());
19787
19788 // The new atomicrmw may go through another round of legalization later.
19789 if (!FullFlatEmulation) {
19790 // We inserted the runtime check already, make sure we do not try to
19791 // re-expand this.
19792 // TODO: Should union with any existing metadata.
19793 MDBuilder MDB(F->getContext());
19794 MDNode *RangeNotPrivate =
19797 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19798 RangeNotPrivate);
19799 }
19800
19801 Builder.CreateBr(PhiBB);
19802
19803 Builder.SetInsertPoint(PhiBB);
19804
19805 if (ReturnValueIsUsed) {
19806 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19807 AI->replaceAllUsesWith(Loaded);
19808 if (FullFlatEmulation)
19809 Loaded->addIncoming(LoadedShared, SharedBB);
19810 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19811 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19812 Loaded->takeName(AI);
19813 }
19814
19815 Builder.CreateBr(ExitBB);
19816}
19817
19819 unsigned PtrOpIdx) {
19820 Value *PtrOp = I->getOperand(PtrOpIdx);
19823
19824 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19825 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19826 I->getIterator());
19827 I->setOperand(PtrOpIdx, ASCast);
19828}
19829
19832
19835
19838 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19839 ConstVal && ConstVal->isNullValue()) {
19840 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19842
19843 // We may still need the private-alias-flat handling below.
19844
19845 // TODO: Skip this for cases where we cannot access remote memory.
19846 }
19847 }
19848
19849 // The non-flat expansions should only perform the de-canonicalization of
19850 // identity values.
19852 return;
19853
19855}
19856
19863
19867
19869 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19870}
19871
19873 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19874 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19875
19877 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19878}
19879
19880LoadInst *
19882 IRBuilder<> Builder(AI);
19883 auto Order = AI->getOrdering();
19884
19885 // The optimization removes store aspect of the atomicrmw. Therefore, cache
19886 // must be flushed if the atomic ordering had a release semantics. This is
19887 // not necessary a fence, a release fence just coincides to do that flush.
19888 // Avoid replacing of an atomicrmw with a release semantics.
19889 if (isReleaseOrStronger(Order))
19890 return nullptr;
19891
19892 LoadInst *LI = Builder.CreateAlignedLoad(
19893 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19894 LI->setAtomic(Order, AI->getSyncScopeID());
19895 LI->copyMetadata(*AI);
19896 LI->takeName(AI);
19897 AI->replaceAllUsesWith(LI);
19898 AI->eraseFromParent();
19899 return LI;
19900}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1268
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1265
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1175
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5976
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1564
bool isNegative() const
Definition APFloat.h:1516
bool isNormal() const
Definition APFloat.h:1520
APInt bitcastToAPInt() const
Definition APFloat.h:1408
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1193
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1134
bool isInfinity() const
Definition APFloat.h:1513
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:805
Argument * getArg(unsigned i) const
Definition Function.h:886
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:229
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:223
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:226
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform a atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform a atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:429
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:907
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
bool use_empty() const
Definition Value.h:347
iterator_range< use_iterator > uses()
Definition Value.h:381
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779