LLVM 23.0.0git
SIISelLowering.cpp
Go to the documentation of this file.
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
48#include "llvm/Support/ModRef.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59static cl::opt<bool>
60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
61 cl::desc("Do not align and prefetch loops"),
62 cl::init(false));
63
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(false));
68
73
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
93 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
100 addRegisterClass(MVT::f32, V32RegClass);
101
102 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
106
107 addRegisterClass(MVT::f64, V64RegClass);
108 addRegisterClass(MVT::v2f32, V64RegClass);
109 addRegisterClass(MVT::Untyped, V64RegClass);
110
111 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
113
114 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
119
120 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
122
123 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
125
126 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
128
129 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
131
132 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
134
135 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
137
138 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
140
141 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(MVT::v10f32,
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32,
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32,
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
152
153 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(MVT::v16f32,
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
156
157 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
159
160 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(MVT::v16f64,
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
173 }
174
 175 // Unless there are also VOP3P operations, no operations are really legal.
176 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
177 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
180 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(MVT::v32f32,
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
196
197 computeRegisterProperties(Subtarget->getRegisterInfo());
198
199 // The boolean content concept here is too inflexible. Compares only ever
200 // really produce a 1-bit result. Any copy/extend from these will turn into a
201 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
202 // it's what most targets use.
205
206 // We need to custom lower vector stores from local memory
208 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
212 Custom);
213
215 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
216 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
217 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
218 MVT::i1, MVT::v32i32},
219 Custom);
220
221 if (isTypeLegal(MVT::bf16)) {
222 for (unsigned Opc :
231 ISD::SETCC}) {
232 setOperationAction(Opc, MVT::bf16, Promote);
233 }
234
236
238 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
239
243
244 // We only need to custom lower because we can't specify an action for bf16
245 // sources.
248 }
249
250 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
251 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
252 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
253 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
254 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
255 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
256 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
257 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
260 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
261 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
262 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
263 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
265 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
266
267 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
268 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
269 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
271 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
272 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
273 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
274
275 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
276 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
277
281 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
282
283 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
284
286 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
287
289 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
290 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
291
293 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
296 Expand);
298 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
301 Expand);
302
304 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
306 Custom);
307
310 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
311
313
315
317 Expand);
318
319#if 0
321#endif
322
323 // We only support LOAD/STORE and vector manipulation ops for vectors
324 // with > 4 elements.
325 for (MVT VT :
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
334 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
335 switch (Op) {
336 case ISD::LOAD:
337 case ISD::STORE:
339 case ISD::BITCAST:
340 case ISD::UNDEF:
344 case ISD::IS_FPCLASS:
345 break;
350 break;
351 default:
353 break;
354 }
355 }
356 }
357
359
360 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
361 // is expanded to avoid having two separate loops in case the index is a VGPR.
362
363 // Most operations are naturally 32-bit vector operations. We only support
364 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
365 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
377 }
378
379 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
391 }
392
393 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
405 }
406
407 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
419 }
420
421 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
423 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
430
432 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
433 }
434
436 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
438 Custom);
439
440 if (Subtarget->hasPkMovB32()) {
441 // TODO: 16-bit element vectors should be legal with even aligned elements.
442 // TODO: Can be legal with wider source types than the result with
443 // subregister extracts.
444 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
445 }
446
448 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
449 // instead lower to cndmask in SITargetLowering::LowerSELECT().
451 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
452 // alignbit.
453 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
454
455 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
456 Custom);
457
458 // Avoid stack access for these.
459 // TODO: Generalize to more vector types.
461 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
463 Custom);
464
465 // Deal with vec3 vector operations when widened to vec4.
467 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
468
469 // Deal with vec5/6/7 vector operations when widened to vec8.
471 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
475 Custom);
476
477 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
478 // and output demarshalling
479 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
480
481 // We can't return success/failure, only the old value,
482 // let LLVM add the comparison
484 Expand);
485
486 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
487
488 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
489
490 // FIXME: This should be narrowed to i32, but that only happens if i64 is
491 // illegal.
492 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
493 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
494
495 // On SI this is s_memtime and s_memrealtime on VI.
497
498 if (Subtarget->hasSMemRealTime() ||
499 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
502
503 if (Subtarget->has16BitInsts()) {
506 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
509 } else {
511 }
512
513 if (Subtarget->hasMadMacF32Insts())
515
518
519 // We only really have 32-bit BFE instructions (and 16-bit on VI).
520 //
521 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
522 // effort to match them now. We want this to be false for i64 cases when the
523 // extraction isn't restricted to the upper or lower half. Ideally we would
524 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
525 // span the midpoint are probably relatively rare, so don't worry about them
526 // for now.
528
529 // Clamp modifier on add/sub
530 if (Subtarget->hasIntClamp())
532
533 if (Subtarget->hasAddNoCarryInsts())
534 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
535 Legal);
536
539 {MVT::f32, MVT::f64}, Custom);
540
541 // These are really only legal for ieee_mode functions. We should be avoiding
542 // them for functions that don't have ieee_mode enabled, so just say they are
543 // legal.
545 {MVT::f32, MVT::f64}, Legal);
546
547 if (Subtarget->haveRoundOpsF64())
549 Legal);
550 else
552 MVT::f64, Custom);
553
555 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
556 Legal);
557 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
558
561
562 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
564
566 Custom);
568 Custom);
570 Custom);
571
572 // Custom lower these because we can't specify a rule based on an illegal
573 // source bf16.
576
577 if (Subtarget->has16BitInsts()) {
580 MVT::i16, Legal);
581
582 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
583
585 MVT::i16, Expand);
586
590 ISD::CTPOP},
591 MVT::i16, Promote);
592
594
595 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
596
598 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
601
605
607
608 // F16 - Constant Actions.
611
612 // F16 - Load/Store Actions.
614 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
617
618 // BF16 - Load/Store Actions.
620 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
623
624 // F16 - VOP1 Actions.
627 MVT::f16, Custom);
628
629 // BF16 - VOP1 Actions.
630 if (Subtarget->hasBF16TransInsts())
632
635 MVT::f16, Promote);
638 MVT::bf16, Promote);
639
640 // F16 - VOP2 Actions.
641 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
642 Expand);
646
647 // F16 - VOP3 Actions.
649 if (STI.hasMadF16())
651
652 for (MVT VT :
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
656 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
657 switch (Op) {
658 case ISD::LOAD:
659 case ISD::STORE:
661 case ISD::BITCAST:
662 case ISD::UNDEF:
667 case ISD::IS_FPCLASS:
668 break;
671 case ISD::FSIN:
672 case ISD::FCOS:
674 break;
675 default:
677 break;
678 }
679 }
680 }
681
682 // v_perm_b32 can handle either of these.
683 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
685
686 // XXX - Do these do anything? Vector constants turn into build_vector.
687 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
688
689 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
690 Legal);
691
693 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
695 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
696
698 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
700 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
701
702 setOperationAction(ISD::AND, MVT::v2i16, Promote);
703 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
704 setOperationAction(ISD::OR, MVT::v2i16, Promote);
705 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
706 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
707 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
708
710 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
712 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
713 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
722
724 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
726 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
727 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
729
731 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
733 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
734
736 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
738 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
740 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
741
742 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
744 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
746 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
748
750 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
752 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
753 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
755
756 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
757 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
758 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
759 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
760 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
761 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
762
764 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
766 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
767 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
768 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
769
771 MVT::v2i32, Expand);
773
775 MVT::v4i32, Expand);
776
778 MVT::v8i32, Expand);
779
780 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
781 Subtarget->hasVOP3PInsts() ? Legal : Custom);
782
783 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
784 // This isn't really legal, but this avoids the legalizer unrolling it (and
785 // allows matching fneg (fabs x) patterns)
786 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
787
788 // Can do this in one BFI plus a constant materialize.
790 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
793 Custom);
794
797 MVT::f16, Custom);
799
802 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
803 Custom);
804
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
807 Expand);
808
809 for (MVT Vec16 :
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
814 Vec16, Custom);
816 }
817 }
818
819 if (Subtarget->hasVOP3PInsts()) {
823 MVT::v2i16, Legal);
824
827 MVT::v2f16, Legal);
828
830 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
831
833 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
836 Custom);
837
838 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
839 // Split vector operations.
844 VT, Custom);
845
846 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
847 // Split vector operations.
849 VT, Custom);
850
853 {MVT::v2f16, MVT::v4f16}, Custom);
854
855 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
856 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
857 Custom);
858
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
861 // Split vector operations.
863 VT, Custom);
864 }
865
866 if (Subtarget->hasPackedFP32Ops()) {
868 MVT::v2f32, Legal);
870 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
871 Custom);
872 }
873 }
874
876
877 if (Subtarget->has16BitInsts()) {
879 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
881 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
882 } else {
883 // Legalization hack.
884 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
885
887 }
888
890 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
894 Custom);
895
897
898 if (Subtarget->hasVectorMulU64())
900 else if (Subtarget->hasScalarSMulU64())
902
903 if (Subtarget->hasMad64_32())
905
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
908
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
911 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
912 } else {
913 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
914 if (Subtarget->hasMinimum3Maximum3F32())
916
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
919
920 // If only the vector form is available, we need to widen to a vector.
921 if (!Subtarget->hasMinimum3Maximum3F16())
923 }
924 }
925
926 if (Subtarget->hasVOP3PInsts()) {
927 // We want to break these into v2f16 pieces, not scalarize.
929 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
930 Custom);
931 }
932
933 if (Subtarget->hasIntMinMax64())
935 Legal);
936
938 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
940 MVT::i8},
941 Custom);
942
944 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
948 Custom);
949
951 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
955 Custom);
956
962
963 // TODO: Could move this to custom lowering, could benefit from combines on
964 // extract of relevant bits.
966
968
969 if (Subtarget->hasBF16ConversionInsts()) {
970 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
972 }
973
974 if (Subtarget->hasBF16PackedInsts()) {
977 MVT::v2bf16, Legal);
978 }
979
980 if (Subtarget->hasBF16TransInsts()) {
982 }
983
984 if (Subtarget->hasCvtPkF16F32Inst()) {
986 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
987 Custom);
988 }
989
993 ISD::SUB,
995 ISD::MUL,
996 ISD::FADD,
997 ISD::FSUB,
998 ISD::FDIV,
999 ISD::FMUL,
1008 ISD::FMA,
1009 ISD::SMIN,
1010 ISD::SMAX,
1011 ISD::UMIN,
1012 ISD::UMAX,
1013 ISD::SETCC,
1015 ISD::SMIN,
1016 ISD::SMAX,
1017 ISD::UMIN,
1018 ISD::UMAX,
1019 ISD::AND,
1020 ISD::OR,
1021 ISD::XOR,
1022 ISD::SHL,
1023 ISD::SRL,
1024 ISD::SRA,
1025 ISD::FSHR,
1036
1037 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1039
1040 // All memory operations. Some folding on the pointer operand is done to help
1041 // matching the constant offsets in the addressing modes.
1043 ISD::STORE,
1068
1069 // FIXME: In other contexts we pretend this is a per-function property.
1071
1073}
1074
1075const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1076
1078 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1079 return RCRegs;
1080}
1081
1082//===----------------------------------------------------------------------===//
1083// TargetLowering queries
1084//===----------------------------------------------------------------------===//
1085
1086// v_mad_mix* support a conversion from f16 to f32.
1087//
1088// There is only one special case when denormals are enabled we don't currently,
1089// where this is OK to use.
1090bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1091 EVT DestVT, EVT SrcVT) const {
1092 return DestVT.getScalarType() == MVT::f32 &&
1093 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1094 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1095 SrcVT.getScalarType() == MVT::f16) ||
1096 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1097 SrcVT.getScalarType() == MVT::bf16)) &&
1098 // TODO: This probably only requires no input flushing?
1100}
1101
1103 LLT DestTy, LLT SrcTy) const {
1104 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1105 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1106 DestTy.getScalarSizeInBits() == 32 &&
1107 SrcTy.getScalarSizeInBits() == 16 &&
1108 // TODO: This probably only requires no input flushing?
1109 denormalModeIsFlushAllF32(*MI.getMF());
1110}
1111
1113 // SI has some legal vector types, but no legal vector operations. Say no
1114 // shuffles are legal in order to prefer scalarizing some vector operations.
1115 return false;
1116}
1117
1119 CallingConv::ID CC,
1120 EVT VT) const {
1122 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1123
1124 if (VT.isVector()) {
1125 EVT ScalarVT = VT.getScalarType();
1126 unsigned Size = ScalarVT.getSizeInBits();
1127 if (Size == 16) {
1128 return Subtarget->has16BitInsts()
1129 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1130 : MVT::i32;
1131 }
1132
1133 if (Size < 16)
1134 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1135 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1136 }
1137
1138 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1139 return MVT::i32;
1140
1141 if (VT.getSizeInBits() > 32)
1142 return MVT::i32;
1143
1144 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1145}
1146
1148 CallingConv::ID CC,
1149 EVT VT) const {
1151 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1152
1153 if (VT.isVector()) {
1154 unsigned NumElts = VT.getVectorNumElements();
1155 EVT ScalarVT = VT.getScalarType();
1156 unsigned Size = ScalarVT.getSizeInBits();
1157
1158 // FIXME: Should probably promote 8-bit vectors to i16.
1159 if (Size == 16)
1160 return (NumElts + 1) / 2;
1161
1162 if (Size <= 32)
1163 return NumElts;
1164
1165 if (Size > 32)
1166 return NumElts * ((Size + 31) / 32);
1167 } else if (VT.getSizeInBits() > 32)
1168 return (VT.getSizeInBits() + 31) / 32;
1169
1170 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1171}
1172
1174 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1175 unsigned &NumIntermediates, MVT &RegisterVT) const {
1176 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1177 unsigned NumElts = VT.getVectorNumElements();
1178 EVT ScalarVT = VT.getScalarType();
1179 unsigned Size = ScalarVT.getSizeInBits();
1180 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1181 // support, but unless we can properly handle 3-vectors, it will still be
 1182 // inconsistent.
1183 if (Size == 16) {
1184 MVT SimpleIntermediateVT =
1186 IntermediateVT = SimpleIntermediateVT;
1187 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1188 NumIntermediates = (NumElts + 1) / 2;
1189 return (NumElts + 1) / 2;
1190 }
1191
1192 if (Size == 32) {
1193 RegisterVT = ScalarVT.getSimpleVT();
1194 IntermediateVT = RegisterVT;
1195 NumIntermediates = NumElts;
1196 return NumIntermediates;
1197 }
1198
1199 if (Size < 16 && Subtarget->has16BitInsts()) {
1200 // FIXME: Should probably form v2i16 pieces
1201 RegisterVT = MVT::i16;
1202 IntermediateVT = ScalarVT;
1203 NumIntermediates = NumElts;
1204 return NumIntermediates;
1205 }
1206
1207 if (Size != 16 && Size <= 32) {
1208 RegisterVT = MVT::i32;
1209 IntermediateVT = ScalarVT;
1210 NumIntermediates = NumElts;
1211 return NumIntermediates;
1212 }
1213
1214 if (Size > 32) {
1215 RegisterVT = MVT::i32;
1216 IntermediateVT = RegisterVT;
1217 NumIntermediates = NumElts * ((Size + 31) / 32);
1218 return NumIntermediates;
1219 }
1220 }
1221
1223 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1224}
1225
1227                                  const DataLayout &DL, Type *Ty,
1228                                  unsigned MaxNumLanes) {
  // Compute the memory EVT for an intrinsic load's data: fixed vectors are
  // clamped to at most MaxNumLanes elements; any other type passes through.
1229   assert(MaxNumLanes != 0);
1230
1231   LLVMContext &Ctx = Ty->getContext();
1232   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1233     unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1234     return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1235                             NumElts);
1236   }
1237
1238   return TLI.getValueType(DL, Ty);
1239 }
1240
1241 // Peek through TFE struct returns to only use the data size.
1243                                    const DataLayout &DL, Type *Ty,
1244                                    unsigned MaxNumLanes) {
1245   auto *ST = dyn_cast<StructType>(Ty);
1246   if (!ST)
1247     return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1248
1249   // TFE intrinsics return an aggregate type.
  // Shape is {data, i32 status}; only the data member contributes to the
  // memory VT.
1250   assert(ST->getNumContainedTypes() == 2 &&
1251          ST->getContainedType(1)->isIntegerTy(32));
1252   return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1253 }
1254
1255 /// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1256 /// in-memory representation. This return value is a custom type because there
1257 /// is no MVT::i160 and adding one breaks integer promotion logic. While this
1258 /// could cause issues during codegen, these address space 7 pointers will be
1259 /// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1260 /// in order to allow pre-codegen passes that query TargetTransformInfo, often
1261 /// for cost modeling, to work. (This also sets us up decently for doing the
1262 /// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1264   if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1265     return MVT::amdgpuBufferFatPointer;
  // 192-bit buffer strided pointers (p9) likewise get their own custom MVT.
1267       DL.getPointerSizeInBits(AS) == 192)
1268     return MVT::amdgpuBufferStridedPointer;
1270 }
1271 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1272 /// v8i32 when padding is added.
1273 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1274 /// also v8i32 with padding.
1276   if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1277        DL.getPointerSizeInBits(AS) == 160) ||
1279        DL.getPointerSizeInBits(AS) == 192))
1280     return MVT::v8i32;
  // NOTE(review): other address spaces presumably defer to the base
  // implementation — the fallthrough return is elided in this view.
1282 }
1283
1284static unsigned getIntrMemWidth(unsigned IntrID) {
1285 switch (IntrID) {
1286 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1287 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1288 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1289 return 8;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1295 case Intrinsic::amdgcn_flat_load_monitor_b32:
1296 case Intrinsic::amdgcn_global_load_monitor_b32:
1297 return 32;
1298 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1300 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1301 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1302 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1303 case Intrinsic::amdgcn_flat_load_monitor_b64:
1304 case Intrinsic::amdgcn_global_load_monitor_b64:
1305 return 64;
1306 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1307 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1308 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1309 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1310 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1311 case Intrinsic::amdgcn_flat_load_monitor_b128:
1312 case Intrinsic::amdgcn_global_load_monitor_b128:
1313 return 128;
1314 default:
1315 llvm_unreachable("Unknown width");
1316 }
1317}
1318
1320                                                  unsigned ArgIdx) {
  // Translate a constant C-ABI atomic ordering value (AtomicOrderingCABI)
  // passed as call argument ArgIdx into the corresponding LLVM ordering.
  // The operand must be an immediate — the cast below asserts that.
1321   Value *OrderingArg = CI.getArgOperand(ArgIdx);
1322   unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1323   switch (AtomicOrderingCABI(Ord)) {
1326     break;
1329     break;
1332     break;
1333   default:
1335   }
1336 }
1337
1338static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1339 MDNode *ScopeMD = cast<MDNode>(
1340 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1341 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1342 return CI.getContext().getOrInsertSyncScopeID(Scope);
1343}
1344
1346                                            const CallBase &CI,
1347                                            MachineFunction &MF,
1348                                            unsigned IntrID) const {
  // Populate one or more IntrinsicInfo entries (opcode, memVT, pointer,
  // flags, and optional atomic ordering/scope) describing the memory behavior
  // of the given AMDGPU memory intrinsic. Multi-entry cases model intrinsics
  // that both load from one location and store to another (e.g. *_load_lds).
1350   if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1352   if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1354   Flags |= getTargetMMOFlags(CI);
1355
  // Buffer/image (rsrc) intrinsics are classified from their declared memory
  // effects: read-only, write-only, or atomic/no-return/prefetch.
1356   if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1358     AttributeSet Attr =
1360     MemoryEffects ME = Attr.getMemoryEffects();
1361     if (ME.doesNotAccessMemory())
1362       return;
1363
1364     bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1365     if (!IsSPrefetch) {
1366       auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1367       if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1369     }
1371
1372     IntrinsicInfo Info;
1373     // TODO: Should images get their own address space?
1375
1376     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1377     if (RsrcIntr->IsImage) {
1378       const AMDGPU::ImageDimIntrinsicInfo *Intr =
1380       BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1381       Info.align.reset();
1382     }
1383
1384     Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1385     if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1386       if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1387         // We conservatively set the memory operand of a buffer intrinsic to the
1388         // base resource pointer, so that we can access alias information about
1389         // those pointers. Cases like "this points at the same value
1390         // but with a different offset" are handled in
1391         // areMemAccessesTriviallyDisjoint.
1392         Info.ptrVal = RsrcArg;
1393     }
1394
1395     if (ME.onlyReadsMemory()) {
1396       if (RsrcIntr->IsImage) {
1397         unsigned MaxNumLanes = 4;
1398
1399         if (!BaseOpcode->Gather4) {
1400           // If this isn't a gather, we may have excess loaded elements in the
1401           // IR type. Check the dmask for the real number of elements loaded.
1402           unsigned DMask =
1403               cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1404           MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1405         }
1406
1407         Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1408                                              CI.getType(), MaxNumLanes);
1409       } else {
1410         Info.memVT =
1412                                      std::numeric_limits<unsigned>::max());
1413       }
1414
1415       // FIXME: What does alignment mean for an image?
1416       Info.opc = ISD::INTRINSIC_W_CHAIN;
1417       Info.flags = Flags | MachineMemOperand::MOLoad;
1418     } else if (ME.onlyWritesMemory()) {
1419       Info.opc = ISD::INTRINSIC_VOID;
1420
1421       Type *DataTy = CI.getArgOperand(0)->getType();
1422       if (RsrcIntr->IsImage) {
1423         unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1424         unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1425         Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1426                                            DMaskLanes);
1427       } else
1428         Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1429
1430       Info.flags = Flags | MachineMemOperand::MOStore;
1431     } else {
1432       // Atomic, NoReturn Sampler or prefetch
1433       Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1435
1436       switch (IntrID) {
1437       default:
1438         Info.flags = Flags | MachineMemOperand::MOLoad;
1439         if (!IsSPrefetch)
1440           Info.flags |= MachineMemOperand::MOStore;
1441
1442         if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1443           // Fake memory access type for no return sampler intrinsics
1444           Info.memVT = MVT::i32;
1445         } else {
1446           // XXX - Should this be volatile without known ordering?
1447           Info.flags |= MachineMemOperand::MOVolatile;
1448           Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1449         }
1450         break;
1451       case Intrinsic::amdgcn_raw_buffer_load_lds:
1452       case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1453       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1454       case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1455       case Intrinsic::amdgcn_struct_buffer_load_lds:
1456       case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1457       case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1458       case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1459         unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1460
1461         // Entry 0: Load from buffer.
1462         // Don't set an offset, since the pointer value always represents the
1463         // base of the buffer.
1464         Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1465         Info.flags = Flags | MachineMemOperand::MOLoad;
1466         Infos.push_back(Info);
1467
1468         // Entry 1: Store to LDS.
1469         // Instruction offset is applied, and an additional per-lane offset
1470         // which we simulate using a larger memory type.
1471         Info.memVT = EVT::getIntegerVT(
1472             CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
1473         Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1474         Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
1475                           ->getZExtValue();
1476         Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1477         Info.flags = Flags | MachineMemOperand::MOStore;
1478         Infos.push_back(Info);
1479         return;
1480       }
1481       case Intrinsic::amdgcn_raw_atomic_buffer_load:
1482       case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1483       case Intrinsic::amdgcn_struct_atomic_buffer_load:
1484       case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1485         Info.memVT =
1487                                      std::numeric_limits<unsigned>::max());
1488         Info.flags = Flags | MachineMemOperand::MOLoad;
1489         Infos.push_back(Info);
1490         return;
1491       }
1492       }
1493     }
1494     Infos.push_back(Info);
1495     return;
1496   }
1497
  // Non-buffer/image intrinsics are described case-by-case below.
1498   IntrinsicInfo Info;
1499   switch (IntrID) {
1500   case Intrinsic::amdgcn_ds_ordered_add:
1501   case Intrinsic::amdgcn_ds_ordered_swap: {
1502     Info.opc = ISD::INTRINSIC_W_CHAIN;
1503     Info.memVT = MVT::getVT(CI.getType());
1504     Info.ptrVal = CI.getOperand(0);
1505     Info.align.reset();
1507
1508     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1509     if (!Vol->isZero())
1510       Info.flags |= MachineMemOperand::MOVolatile;
1511
1512     Infos.push_back(Info);
1513     return;
1514   }
1515   case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1516   case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1517     Info.opc = ISD::INTRINSIC_W_CHAIN;
1518     Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1519     Info.ptrVal = nullptr;
1520     Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1522     Infos.push_back(Info);
1523     return;
1524   }
1525   case Intrinsic::amdgcn_ds_append:
1526   case Intrinsic::amdgcn_ds_consume: {
1527     Info.opc = ISD::INTRINSIC_W_CHAIN;
1528     Info.memVT = MVT::getVT(CI.getType());
1529     Info.ptrVal = CI.getOperand(0);
1530     Info.align.reset();
1532
1533     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1534     if (!Vol->isZero())
1535       Info.flags |= MachineMemOperand::MOVolatile;
1536
1537     Infos.push_back(Info);
1538     return;
1539   }
1540   case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1541   case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1542     Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1545     Info.memVT = MVT::getVT(CI.getType());
    // NOTE(review): memVT is immediately overwritten with i64 below; the
    // assignment above appears redundant — confirm before removing.
1546     Info.ptrVal = CI.getOperand(0);
1547     Info.memVT = MVT::i64;
1548     Info.size = 8;
1549     Info.align.reset();
1551     Infos.push_back(Info);
1552     return;
1553   }
1554   case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1555   case Intrinsic::amdgcn_image_bvh_intersect_ray:
1556   case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1557     Info.opc = ISD::INTRINSIC_W_CHAIN;
1558     Info.memVT =
1559         MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1560                        ? CI.getType()
1562                        ->getElementType(0)); // XXX: what is correct VT?
1563
1564     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1565     Info.align.reset();
1566     Info.flags = Flags | MachineMemOperand::MOLoad |
1568     Infos.push_back(Info);
1569     return;
1570   }
1571   case Intrinsic::amdgcn_global_atomic_fmin_num:
1572   case Intrinsic::amdgcn_global_atomic_fmax_num:
1573   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1574   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1575   case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1576     Info.opc = ISD::INTRINSIC_W_CHAIN;
1577     Info.memVT = MVT::getVT(CI.getType());
1578     Info.ptrVal = CI.getOperand(0);
1579     Info.align.reset();
1580     Info.flags =
1583     Infos.push_back(Info);
1584     return;
1585   }
1586   case Intrinsic::amdgcn_cluster_load_b32:
1587   case Intrinsic::amdgcn_cluster_load_b64:
1588   case Intrinsic::amdgcn_cluster_load_b128:
1589   case Intrinsic::amdgcn_ds_load_tr6_b96:
1590   case Intrinsic::amdgcn_ds_load_tr4_b64:
1591   case Intrinsic::amdgcn_ds_load_tr8_b64:
1592   case Intrinsic::amdgcn_ds_load_tr16_b128:
1593   case Intrinsic::amdgcn_global_load_tr6_b96:
1594   case Intrinsic::amdgcn_global_load_tr4_b64:
1595   case Intrinsic::amdgcn_global_load_tr_b64:
1596   case Intrinsic::amdgcn_global_load_tr_b128:
1597   case Intrinsic::amdgcn_ds_read_tr4_b64:
1598   case Intrinsic::amdgcn_ds_read_tr6_b96:
1599   case Intrinsic::amdgcn_ds_read_tr8_b64:
1600   case Intrinsic::amdgcn_ds_read_tr16_b64: {
    // Plain loads: memVT is the full IR return type.
1601     Info.opc = ISD::INTRINSIC_W_CHAIN;
1602     Info.memVT = MVT::getVT(CI.getType());
1603     Info.ptrVal = CI.getOperand(0);
1604     Info.align.reset();
1605     Info.flags = Flags | MachineMemOperand::MOLoad;
1606     Infos.push_back(Info);
1607     return;
1608   }
1609   case Intrinsic::amdgcn_flat_load_monitor_b32:
1610   case Intrinsic::amdgcn_flat_load_monitor_b64:
1611   case Intrinsic::amdgcn_flat_load_monitor_b128:
1612   case Intrinsic::amdgcn_global_load_monitor_b32:
1613   case Intrinsic::amdgcn_global_load_monitor_b64:
1614   case Intrinsic::amdgcn_global_load_monitor_b128: {
1615     Info.opc = ISD::INTRINSIC_W_CHAIN;
1616     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1617     Info.ptrVal = CI.getOperand(0);
1618     Info.align.reset();
1619     Info.flags = MachineMemOperand::MOLoad;
    // NOTE(review): unlike most cases above, the accumulated Flags are not
    // OR'd in here — confirm the monitor loads intend to drop MD-derived
    // flags such as invariant/nontemporal.
1620     Info.order = parseAtomicOrderingCABIArg(CI, 1);
1621     Info.ssid = parseSyncscopeMDArg(CI, 2);
1622     Infos.push_back(Info);
1623     return;
1624   }
1625   case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1626   case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1627   case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1628     Info.opc = ISD::INTRINSIC_W_CHAIN;
1629     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1630     Info.ptrVal = CI.getOperand(0);
1631     Info.align.reset();
1633     Info.order = parseAtomicOrderingCABIArg(CI, 1);
1634     Info.ssid = parseSyncscopeMDArg(CI, 2);
1635     Infos.push_back(Info);
1636     return;
1637   }
1638   case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1639   case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1640   case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1641     Info.opc = ISD::INTRINSIC_VOID;
1642     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1643     Info.ptrVal = CI.getArgOperand(0);
1644     Info.align.reset();
1646     Info.order = parseAtomicOrderingCABIArg(CI, 2);
1647     Info.ssid = parseSyncscopeMDArg(CI, 3);
1648     Infos.push_back(Info);
1649     return;
1650   }
1651   case Intrinsic::amdgcn_ds_gws_init:
1652   case Intrinsic::amdgcn_ds_gws_barrier:
1653   case Intrinsic::amdgcn_ds_gws_sema_v:
1654   case Intrinsic::amdgcn_ds_gws_sema_br:
1655   case Intrinsic::amdgcn_ds_gws_sema_p:
1656   case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1657     Info.opc = ISD::INTRINSIC_VOID;
1658
1659     const GCNTargetMachine &TM =
1660         static_cast<const GCNTargetMachine &>(getTargetMachine());
1661
1663     Info.ptrVal = MFI->getGWSPSV(TM);
1664
1665     // This is an abstract access, but we need to specify a type and size.
1666     Info.memVT = MVT::i32;
1667     Info.size = 4;
1668     Info.align = Align(4);
1669
1670     if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1671       Info.flags = Flags | MachineMemOperand::MOLoad;
1672     else
1673       Info.flags = Flags | MachineMemOperand::MOStore;
1674     Infos.push_back(Info);
1675     return;
1676   }
1677   case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1678   case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1679   case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1680   case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1681   case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1682   case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1683   case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1684   case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1685     // Entry 0: Load from source (global/flat).
1686     Info.opc = ISD::INTRINSIC_VOID;
1687     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1688     Info.ptrVal = CI.getArgOperand(0); // Global pointer
1689     Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1690     Info.flags = Flags | MachineMemOperand::MOLoad;
1691     Infos.push_back(Info);
1692
1693     // Entry 1: Store to LDS (same offset).
1694     Info.flags = Flags | MachineMemOperand::MOStore;
1695     Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1696     Infos.push_back(Info);
1697     return;
1698   }
1699   case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1700   case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1701   case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1702   case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1703     // Entry 0: Load from LDS.
1704     Info.opc = ISD::INTRINSIC_VOID;
1705     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1706     Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1707     Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1708     Info.flags = Flags | MachineMemOperand::MOLoad;
1709     Infos.push_back(Info);
1710
1711     // Entry 1: Store to global (same offset).
1712     Info.flags = Flags | MachineMemOperand::MOStore;
1713     Info.ptrVal = CI.getArgOperand(0); // Global pointer
1714     Infos.push_back(Info);
1715     return;
1716   }
1717   case Intrinsic::amdgcn_load_to_lds:
1718   case Intrinsic::amdgcn_load_async_to_lds:
1719   case Intrinsic::amdgcn_global_load_lds:
1720   case Intrinsic::amdgcn_global_load_async_lds: {
1721     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1722     auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1723     bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1724     if (IsVolatile)
1726
1727     // Entry 0: Load from source (global/flat).
1728     Info.opc = ISD::INTRINSIC_VOID;
1729     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1730     Info.ptrVal = CI.getArgOperand(0); // Source pointer
1731     Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
1732     Info.flags = Flags | MachineMemOperand::MOLoad;
1733     Infos.push_back(Info);
1734
1735     // Entry 1: Store to LDS.
1736     // Same offset from the instruction, but an additional per-lane offset is
1737     // added. Represent that using a wider memory type.
1738     Info.memVT = EVT::getIntegerVT(CI.getContext(),
1739                                    Width * 8 * Subtarget->getWavefrontSize());
1740     Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1741     Info.flags = Flags | MachineMemOperand::MOStore;
1742     Infos.push_back(Info);
1743     return;
1744   }
1745   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1746   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1747   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1748   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1749     Info.opc = ISD::INTRINSIC_W_CHAIN;
1750
1751     const GCNTargetMachine &TM =
1752         static_cast<const GCNTargetMachine &>(getTargetMachine());
1753
1755     Info.ptrVal = MFI->getGWSPSV(TM);
1756
1757     // This is an abstract access, but we need to specify a type and size.
1758     Info.memVT = MVT::i32;
1759     Info.size = 4;
1760     Info.align = Align(4);
1761
1763     Infos.push_back(Info);
1764     return;
1765   }
1766   case Intrinsic::amdgcn_s_prefetch_data:
1767   case Intrinsic::amdgcn_flat_prefetch:
1768   case Intrinsic::amdgcn_global_prefetch: {
    // Prefetches are modeled as a tiny (8-bit) load on the target pointer.
1769     Info.opc = ISD::INTRINSIC_VOID;
1770     Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1771     Info.ptrVal = CI.getArgOperand(0);
1772     Info.flags = Flags | MachineMemOperand::MOLoad;
1773     Infos.push_back(Info);
1774     return;
1775   }
1776   default:
1777     return;
1778   }
1779 }
1780
1782     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
  // Append extra DAG operands for intrinsics whose IR-level type information
  // would otherwise be lost during SelectionDAG construction.
1784   case Intrinsic::amdgcn_addrspacecast_nonnull: {
1785     // The DAG's ValueType loses the addrspaces.
1786     // Add them as 2 extra Constant operands "from" and "to".
1787     unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1788     unsigned DstAS = I.getType()->getPointerAddressSpace();
1789     Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1790     Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1791     break;
1792   }
1793   default:
1794     break;
1795   }
1796 }
1797
1800                                             Type *&AccessTy) const {
  // For LSR and friends: report the pointer operand of a memory intrinsic so
  // addressing-mode costs can be evaluated against it. The pointer is operand
  // 0 for direct-pointer intrinsics, and operand 1 for the *load-to-LDS*
  // family (whose operand 0 is the source, operand 1 the LDS destination).
1801   Value *Ptr = nullptr;
1802   switch (II->getIntrinsicID()) {
1803   case Intrinsic::amdgcn_cluster_load_b128:
1804   case Intrinsic::amdgcn_cluster_load_b64:
1805   case Intrinsic::amdgcn_cluster_load_b32:
1806   case Intrinsic::amdgcn_ds_append:
1807   case Intrinsic::amdgcn_ds_consume:
1808   case Intrinsic::amdgcn_ds_load_tr8_b64:
1809   case Intrinsic::amdgcn_ds_load_tr16_b128:
1810   case Intrinsic::amdgcn_ds_load_tr4_b64:
1811   case Intrinsic::amdgcn_ds_load_tr6_b96:
1812   case Intrinsic::amdgcn_ds_read_tr4_b64:
1813   case Intrinsic::amdgcn_ds_read_tr6_b96:
1814   case Intrinsic::amdgcn_ds_read_tr8_b64:
1815   case Intrinsic::amdgcn_ds_read_tr16_b64:
1816   case Intrinsic::amdgcn_ds_ordered_add:
1817   case Intrinsic::amdgcn_ds_ordered_swap:
1818   case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1819   case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1820   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1821   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1822   case Intrinsic::amdgcn_global_atomic_fmax_num:
1823   case Intrinsic::amdgcn_global_atomic_fmin_num:
1824   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1825   case Intrinsic::amdgcn_global_load_tr_b64:
1826   case Intrinsic::amdgcn_global_load_tr_b128:
1827   case Intrinsic::amdgcn_global_load_tr4_b64:
1828   case Intrinsic::amdgcn_global_load_tr6_b96:
1829   case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1830   case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1831   case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1832   case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1833     Ptr = II->getArgOperand(0);
1834     break;
1835   case Intrinsic::amdgcn_load_to_lds:
1836   case Intrinsic::amdgcn_load_async_to_lds:
1837   case Intrinsic::amdgcn_global_load_lds:
1838   case Intrinsic::amdgcn_global_load_async_lds:
1839   case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1840   case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1841   case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1842   case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1843   case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1844   case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1845   case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1846   case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1847     Ptr = II->getArgOperand(1);
1848     break;
1849   default:
1850     return false;
1851   }
1852   AccessTy = II->getType();
1853   Ops.push_back(Ptr);
1854   return true;
1855 }
1856
1858                                              unsigned AddrSpace) const {
  // Decide whether the given addressing mode can be folded into a FLAT-family
  // instruction for AddrSpace. Scaled indices are never supported.
1859   if (!Subtarget->hasFlatInstOffsets()) {
1860     // Flat instructions do not have offsets, and only have the register
1861     // address.
1862     return AM.BaseOffs == 0 && AM.Scale == 0;
1863   }
1864
1865   decltype(SIInstrFlags::FLAT) FlatVariant =
1869
1870   return AM.Scale == 0 &&
1871          (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1872                                   AM.BaseOffs, AddrSpace, FlatVariant));
1873 }
1874
  // Global address space: prefer global FLAT instructions when available,
  // otherwise fall back to MUBUF legality rules.
1876   if (Subtarget->hasFlatGlobalInsts())
1879   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1880     // Assume the we will use FLAT for all global memory accesses
1881     // on VI.
1882     // FIXME: This assumption is currently wrong. On VI we still use
1883     // MUBUF instructions for the r + i addressing mode. As currently
1884     // implemented, the MUBUF instructions only work on buffer < 4GB.
1885     // It may be possible to support > 4GB buffers with MUBUF instructions,
1886     // by setting the stride value in the resource descriptor which would
1887     // increase the size limit to (stride * 4GB). However, this is risky,
1888     // because it has never been validated.
1890   }
1891
1892   return isLegalMUBUFAddressingMode(AM);
1893 }
1894
1895bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1896 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1897 // additionally can do r + r + i with addr64. 32-bit has more addressing
1898 // mode options. Depending on the resource constant, it can also do
1899 // (i64 r0) + (i32 r1) * (i14 i).
1900 //
1901 // Private arrays end up using a scratch buffer most of the time, so also
1902 // assume those use MUBUF instructions. Scratch loads / stores are currently
1903 // implemented as mubuf instructions with offen bit set, so slightly
1904 // different than the normal addr64.
1905 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1906 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1907 return false;
1908
1909 // FIXME: Since we can split immediate into soffset and immediate offset,
1910 // would it make sense to allow any immediate?
1911
1912 switch (AM.Scale) {
1913 case 0: // r + i or just i, depending on HasBaseReg.
1914 return true;
1915 case 1:
1916 return true; // We have r + r or r + i.
1917 case 2:
1918 if (AM.HasBaseReg) {
1919 // Reject 2 * r + r.
1920 return false;
1921 }
1922
1923 // Allow 2 * r as r + r
1924 // Or 2 * r + i is allowed as r + r + i.
1925 return true;
1926 default: // Don't allow n * r
1927 return false;
1928 }
1929}
1930
1932                                              const AddrMode &AM, Type *Ty,
1933                                              unsigned AS,
1934                                              Instruction *I) const {
  // Top-level addressing-mode legality dispatch: route each address space to
  // the instruction family that will service it (global/FLAT, SMRD/SMEM,
  // scratch, or DS) and apply that family's offset and scale limits.
1935   // No global is ever allowed as a base.
1936   if (AM.BaseGV)
1937     return false;
1938
1939   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1940     return isLegalGlobalAddressingMode(AM);
1941
1942   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1946     // If the offset isn't a multiple of 4, it probably isn't going to be
1947     // correctly aligned.
1948     // FIXME: Can we get the real alignment here?
1949     if (AM.BaseOffs % 4 != 0)
1950       return isLegalMUBUFAddressingMode(AM);
1951
1952     if (!Subtarget->hasScalarSubwordLoads()) {
1953       // There are no SMRD extloads, so if we have to do a small type access we
1954       // will use a MUBUF load.
1955       // FIXME?: We also need to do this if unaligned, but we don't know the
1956       // alignment here.
1957       if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1958         return isLegalGlobalAddressingMode(AM);
1959     }
1960
    // Per-generation SMRD/SMEM immediate-offset encodings.
1961     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1962       // SMRD instructions have an 8-bit, dword offset on SI.
1963       if (!isUInt<8>(AM.BaseOffs / 4))
1964         return false;
1965     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1966       // On CI+, this can also be a 32-bit literal constant offset. If it fits
1967       // in 8-bits, it can use a smaller encoding.
1968       if (!isUInt<32>(AM.BaseOffs / 4))
1969         return false;
1970     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1971       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1972       if (!isUInt<20>(AM.BaseOffs))
1973         return false;
1974     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1975       // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1976       // for S_BUFFER_* instructions).
1977       if (!isInt<21>(AM.BaseOffs))
1978         return false;
1979     } else {
1980       // On GFX12, all offsets are signed 24-bit in bytes.
1981       if (!isInt<24>(AM.BaseOffs))
1982         return false;
1983     }
1984
1985     if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1987          AM.BaseOffs < 0) {
1988       // Scalar (non-buffer) loads can only use a negative offset if
1989       // soffset+offset is non-negative. Since the compiler can only prove that
1990       // in a few special cases, it is safer to claim that negative offsets are
1991       // not supported.
1992       return false;
1993     }
1994
1995     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1996       return true;
1997
1998     if (AM.Scale == 1 && AM.HasBaseReg)
1999       return true;
2000
2001     return false;
2002   }
2003
2004   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2005     return Subtarget->hasFlatScratchEnabled()
2007                : isLegalMUBUFAddressingMode(AM);
2008
2009   if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2010       (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2011     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2012     // field.
2013     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2014     // an 8-bit dword offset but we don't know the alignment here.
2015     if (!isUInt<16>(AM.BaseOffs))
2016       return false;
2017
2018     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2019       return true;
2020
2021     if (AM.Scale == 1 && AM.HasBaseReg)
2022       return true;
2023
2024     return false;
2025   }
2026
2028   // For an unknown address space, this usually means that this is for some
2029   // reason being used for pure arithmetic, and not based on some addressing
2030   // computation. We don't have instructions that compute pointers with any
2031   // addressing modes, so treat them as having no offset like flat
2032   // instructions.
2034   }
2035
2036   // Assume a user alias of global for unknown address spaces.
2037   return isLegalGlobalAddressingMode(AM);
2038 }
2039
2041                                         const MachineFunction &MF) const {
  // Cap merged-store width by address space: 4 dwords for the first (elided)
  // case, the subtarget's max private element size for scratch, and 2 dwords
  // for the second elided case; anything else is unrestricted.
2043     return (MemVT.getSizeInBits() <= 4 * 32);
2044   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2045     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2046     return (MemVT.getSizeInBits() <= MaxPrivateBits);
2047   }
2049     return (MemVT.getSizeInBits() <= 2 * 32);
2050   return true;
2051 }
2052
2054 unsigned Size, unsigned AddrSpace, Align Alignment,
2055 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2056 if (IsFast)
2057 *IsFast = 0;
2058
2059 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2060 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2061 // Check if alignment requirements for ds_read/write instructions are
2062 // disabled.
2063 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2064 return false;
2065
2066 Align RequiredAlignment(
2067 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2068 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2069 Alignment < RequiredAlignment)
2070 return false;
2071
2072 // Either, the alignment requirements are "enabled", or there is an
2073 // unaligned LDS access related hardware bug though alignment requirements
2074 // are "disabled". In either case, we need to check for proper alignment
2075 // requirements.
2076 //
2077 switch (Size) {
2078 case 64:
2079 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2080 // address is negative, then the instruction is incorrectly treated as
2081 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2082 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2083 // load later in the SILoadStoreOptimizer.
2084 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2085 return false;
2086
2087 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2088 // can do a 4 byte aligned, 8 byte access in a single operation using
2089 // ds_read2/write2_b32 with adjacent offsets.
2090 RequiredAlignment = Align(4);
2091
2092 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2093 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2094 // ds_write2_b32 depending on the alignment. In either case with either
2095 // alignment there is no faster way of doing this.
2096
2097 // The numbers returned here and below are not additive, it is a 'speed
2098 // rank'. They are just meant to be compared to decide if a certain way
2099 // of lowering an operation is faster than another. For that purpose
2100 // naturally aligned operation gets it bitsize to indicate that "it
2101 // operates with a speed comparable to N-bit wide load". With the full
2102 // alignment ds128 is slower than ds96 for example. If underaligned it
2103 // is comparable to a speed of a single dword access, which would then
2104 // mean 32 < 128 and it is faster to issue a wide load regardless.
2105 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2106 // wider load which will not be aligned anymore the latter is slower.
2107 if (IsFast)
2108 *IsFast = (Alignment >= RequiredAlignment) ? 64
2109 : (Alignment < Align(4)) ? 32
2110 : 1;
2111 return true;
2112 }
2113
2114 break;
2115 case 96:
2116 if (!Subtarget->hasDS96AndDS128())
2117 return false;
2118
2119 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2120 // gfx8 and older.
2121
2122 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2123 // Naturally aligned access is fastest. However, also report it is Fast
2124 // if memory is aligned less than DWORD. A narrow load or store will be
2125 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2126 // be more of them, so overall we will pay less penalty issuing a single
2127 // instruction.
2128
2129 // See comment on the values above.
2130 if (IsFast)
2131 *IsFast = (Alignment >= RequiredAlignment) ? 96
2132 : (Alignment < Align(4)) ? 32
2133 : 1;
2134 return true;
2135 }
2136
2137 break;
2138 case 128:
2139 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2140 return false;
2141
2142 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2143 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2144 // single operation using ds_read2/write2_b64.
2145 RequiredAlignment = Align(8);
2146
2147 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2148 // Naturally aligned access is fastest. However, also report it is Fast
2149 // if memory is aligned less than DWORD. A narrow load or store will be
2150 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2151 // will be more of them, so overall we will pay less penalty issuing a
2152 // single instruction.
2153
2154 // See comment on the values above.
2155 if (IsFast)
2156 *IsFast = (Alignment >= RequiredAlignment) ? 128
2157 : (Alignment < Align(4)) ? 32
2158 : 1;
2159 return true;
2160 }
2161
2162 break;
2163 default:
2164 if (Size > 32)
2165 return false;
2166
2167 break;
2168 }
2169
2170 // See comment on the values above.
2171 // Note that we have a single-dword or sub-dword here, so if underaligned
2172 // it is a slowest possible access, hence returned value is 0.
2173 if (IsFast)
2174 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2175
2176 return Alignment >= RequiredAlignment ||
2177 Subtarget->hasUnalignedDSAccessEnabled();
2178 }
2179
2180 // FIXME: We have to be conservative here and assume that flat operations
2181 // will access scratch. If we had access to the IR function, then we
2182 // could determine if any private memory was used in the function.
2183 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2184 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2185 bool AlignedBy4 = Alignment >= Align(4);
2186 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2187 if (IsFast)
2188 *IsFast = AlignedBy4 ? Size : 1;
2189 return true;
2190 }
2191
2192 if (IsFast)
2193 *IsFast = AlignedBy4;
2194
2195 return AlignedBy4;
2196 }
2197
2198 // So long as they are correct, wide global memory operations perform better
2199 // than multiple smaller memory ops -- even when misaligned
2200 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2201 if (IsFast)
2202 *IsFast = Size;
2203
2204 return Alignment >= Align(4) ||
2205 Subtarget->hasUnalignedBufferAccessEnabled();
2206 }
2207
2208 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2209 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2210 // out-of-bounds behavior, but in the edge case where an access starts
2211 // out-of-bounds and then enter in-bounds, the entire access would be treated
2212 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2213 // natural alignment of buffer accesses.
2214 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2215 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2216 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2217 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2218 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2219 return false;
2220 }
2221
2222 // Smaller than dword value must be aligned.
2223 if (Size < 32)
2224 return false;
2225
2226 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2227 // byte-address are ignored, thus forcing Dword alignment.
2228 // This applies to private, global, and constant memory.
2229 if (IsFast)
2230 *IsFast = 1;
2231
2232 return Size >= 32 && Alignment >= Align(4);
2233}
2234
// Public TargetLowering hook for misaligned-access legality/speed queries on
// an EVT. The signature line and the callee line were elided in this dump;
// NOTE(review): presumably delegates to allowsMisalignedMemoryAccessesImpl
// with VT's bit size — confirm against upstream.
2236 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2237 unsigned *IsFast) const {
2239 Alignment, Flags, IsFast);
2240}
2241
// Pick a preferred type for lowering memcpy/memset-like operations:
//   >= 16 bytes, destination 4-byte aligned  -> v4i32 (16-byte chunks)
//   >=  8 bytes, destination 4-byte aligned  -> v2i32 ( 8-byte chunks)
//   otherwise                                -> MVT::Other (generic choice)
2243 LLVMContext &Context, const MemOp &Op,
2244 const AttributeList &FuncAttributes) const {
2245 // FIXME: Should account for address space here.
2246
2247 // The default fallback uses the private pointer size as a guess for a type to
2248 // use. Make sure we switch these to 64-bit accesses.
2249
2250 if (Op.size() >= 16 &&
2251 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2252 return MVT::v4i32;
2253
2254 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2255 return MVT::v2i32;
2256
2257 // Use the default.
2258 return MVT::Other;
2259}
2260
// True iff this memory SDNode's MachineMemOperand carries the target-specific
// MONoClobber flag. (Function name line was elided in this dump.)
2262 const MemSDNode *MemNode = cast<MemSDNode>(N);
2263 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2264}
2265
2270
// Returns true when an address-space cast from SrcAS to DestAS lowers to a
// no-op or simple truncate (i.e. is "free"). Flat->private is NOT free when
// globally-addressable scratch is in use, since it needs a base subtraction.
2272 unsigned DestAS) const {
2273 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2274 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2275 Subtarget->hasGloballyAddressableScratch()) {
2276 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2277 return false;
2278 }
2279
2280 // Flat -> private/local is a simple truncate.
2281 // Flat -> global is no-op
2282 return true;
2283 }
2284
// Non-flat sources: defer to the target machine's generic no-op-cast query.
2285 const GCNTargetMachine &TM =
2286 static_cast<const GCNTargetMachine &>(getTargetMachine());
2287 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2288}
2289
2297
// Unconditionally answers "yes" for this query (signature line elided;
// NOTE(review): by position this is likely shouldConvertConstantLoadToIntImm —
// confirm against upstream before relying on the name).
2299 Type *Ty) const {
2300 // FIXME: Could be smarter if called for vector constants.
2301 return true;
2302}
2303
// A subvector extract is considered cheap only at index 0 (a plain
// subregister read). The `return false` below is guarded by a condition on an
// elided line (original 2306) — confirm the guard against upstream.
2305 unsigned Index) const {
2307 return false;
2308
2309 // TODO: Add more cases that are cheap.
2310 return Index == 0;
2311}
2312
2313bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2314 // TODO: This should be more aggressive, particular for 16-bit element
2315 // vectors. However there are some mixed improvements and regressions.
2316 EVT EltTy = VT.getVectorElementType();
2317 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2318 return EltTy.getSizeInBits() % MinAlign == 0;
2319}
2320
// Filters which (Op, VT) pairs DAG combines should produce: with 16-bit
// instructions, i16 is desirable only for loads/stores; i1 setcc is never
// desirable (no i1 setcc instructions). All other cases fall through to a
// final return on an elided line (original 2337) — presumably the base-class
// TargetLowering query; confirm against upstream.
2322 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2323 switch (Op) {
2324 case ISD::LOAD:
2325 case ISD::STORE:
2326 return true;
2327 default:
2328 return false;
2329 }
2330 }
2331
2332 // SimplifySetCC uses this function to determine whether or not it should
2333 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2334 if (VT == MVT::i1 && Op == ISD::SETCC)
2335 return false;
2336
2338}
2339
// Builds and returns a MachinePointerInfo (construction lines elided in this
// dump) that is treated like a constant-pool reference for aliasing purposes.
2342 // This isn't really a constant pool but close enough.
2345 return PtrInfo;
2346}
2347
/// Build a pointer to byte \p Offset within the kernel argument segment.
/// If the function has no kernarg segment pointer register (no kernel
/// arguments), the absolute offset is returned as a constant instead.
/// Local declarations for Info/PtrVT/MRI sit on lines elided in this dump.
2348SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2349 const SDLoc &SL,
2350 SDValue Chain,
2351 uint64_t Offset) const {
2352 const DataLayout &DL = DAG.getDataLayout();
2356
2357 auto [InputPtrReg, RC, ArgTy] =
2358 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2359
2360 // We may not have the kernarg segment argument if we have no kernel
2361 // arguments.
2362 if (!InputPtrReg)
2363 return DAG.getConstant(Offset, SL, PtrVT);
2364
// Copy the live-in segment base out of its virtual register, then offset it.
2366 SDValue BasePtr = DAG.getCopyFromReg(
2367 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2368
2369 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2370}
2371
// Returns a pointer to the implicit-argument area of the kernarg segment.
// The Offset computation sits on elided lines (original 2374-2375);
// presumably the implicit-parameter offset — confirm against upstream.
2372SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2373 const SDLoc &SL) const {
2376 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2377}
2378
// Materializes the LDS kernel id as an i32 constant when its value is known
// (KnownSize's producer is on an elided line); returns a null SDValue when it
// is not known so the caller can fall back.
2379SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2380 const SDLoc &SL) const {
2381
2383 std::optional<uint32_t> KnownSize =
2385 if (KnownSize.has_value())
2386 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2387 return SDValue();
2388}
2389
/// Convert an in-memory argument value of type \p MemVT into the expected
/// value type \p VT: narrow widened vectors, re-assert known sign/zero
/// extension, and extend/truncate/bitcast as needed. \p Signed selects
/// sign- vs zero-extension for integer widening; \p Arg (may be null)
/// supplies the original argument's extension flags.
2390SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2391 const SDLoc &SL, SDValue Val,
2392 bool Signed,
2393 const ISD::InputArg *Arg) const {
2394 // First, if it is a widened vector, narrow it.
// The narrowing condition and NarrowedVT construction span elided lines
// (original 2396, 2398-2399).
2395 if (VT.isVector() &&
2397 EVT NarrowedVT =
2400 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2401 DAG.getConstant(0, SL, MVT::i32));
2402 }
2403
2404 // Then convert the vector elements or scalar value.
2405 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2406 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2407 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2408 }
2409
2410 if (MemVT.isFloatingPoint()) {
2411 if (VT.isFloatingPoint()) {
2412 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2413 } else {
// FP-in-memory but integer value type: move the bits through an integer
// type of the same width, then extend/truncate.
2414 assert(!MemVT.isVector());
2415 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2416 SDValue Cast = DAG.getBitcast(IntVT, Val);
2417 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2418 }
2419 } else if (Signed)
2420 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2421 else
2422 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2423
2424 return Val;
2425}
2426
/// Load a kernel argument of memory type \p MemVT at \p Offset in the kernarg
/// segment and coerce it to \p VT. Sub-dword, under-aligned arguments are
/// loaded as the containing aligned dword, shifted and truncated, so the load
/// can merge with a neighbor. Returns {value, load chain}.
2427SDValue SITargetLowering::lowerKernargMemParameter(
2428 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2429 uint64_t Offset, Align Alignment, bool Signed,
2430 const ISD::InputArg *Arg) const {
2431
// PtrInfo's initializer is on an elided line (original 2433).
2432 MachinePointerInfo PtrInfo =
2434
2435 // Try to avoid using an extload by loading earlier than the argument address,
2436 // and extracting the relevant bits. The load should hopefully be merged with
2437 // the previous argument.
2438 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2439 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2440 int64_t AlignDownOffset = alignDown(Offset, 4);
2441 int64_t OffsetDiff = Offset - AlignDownOffset;
2442
2443 EVT IntVT = MemVT.changeTypeToInteger();
2444
2445 // TODO: If we passed in the base kernel offset we could have a better
2446 // alignment than 4, but we don't really need it.
// The MachineMemOperand flags for this load are on elided lines.
2447 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2448 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2449 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2452
// Shift the wanted bytes down to bit 0, then truncate and bitcast back to
// the argument's in-memory type.
2453 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2454 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2455
2456 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2457 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2458 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2459
2460 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2461 }
2462
// Common case: a direct load at the argument's own offset and alignment.
2463 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2464 SDValue Load = DAG.getLoad(
2465 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2467
2468 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2469 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2470}
2471
2472/// Coerce an argument which was passed in a different ABI type to the original
2473/// expected value type.
2474SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2475 SDValue Val,
2476 CCValAssign &VA,
2477 const SDLoc &SL) const {
2478 EVT ValVT = VA.getValVT();
2479
2480 // If this is an 8 or 16-bit value, it is really passed promoted
2481 // to 32 bits. Insert an assert[sz]ext to capture this, then
2482 // truncate to the right size.
2483 switch (VA.getLocInfo()) {
2484 case CCValAssign::Full:
2485 return Val;
2486 case CCValAssign::BCvt:
2487 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2488 case CCValAssign::SExt:
2489 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2490 DAG.getValueType(ValVT));
2491 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2492 case CCValAssign::ZExt:
2493 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2494 DAG.getValueType(ValVT));
2495 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2496 case CCValAssign::AExt:
2497 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2498 default:
2499 llvm_unreachable("Unknown loc info!");
2500 }
2501}
2502
/// Lower an incoming argument that was assigned a stack location: byval
/// arguments become a frame index; everything else becomes an (ext)load from
/// a fixed stack object, coerced back to the expected value type.
2503SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2504 CCValAssign &VA, const SDLoc &SL,
2505 SDValue Chain,
2506 const ISD::InputArg &Arg) const {
2507 MachineFunction &MF = DAG.getMachineFunction();
2508 MachineFrameInfo &MFI = MF.getFrameInfo();
2509
2510 if (Arg.Flags.isByVal()) {
// Byval: the caller already wrote the bytes; just hand back the address.
2511 unsigned Size = Arg.Flags.getByValSize();
2512 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2513 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2514 }
2515
2516 unsigned ArgOffset = VA.getLocMemOffset();
2517 unsigned ArgSize = VA.getValVT().getStoreSize();
2518
2519 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2520
2521 // Create load nodes to retrieve arguments from the stack.
2522 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2524 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
// ExtType's declaration (defaulting to NON_EXTLOAD) is on an elided line
// (original 2525).
2526 MVT MemVT = VA.getValVT();
2527
// Translate the CC-assigned location kind into a load extension kind.
2528 switch (VA.getLocInfo()) {
2529 default:
2530 break;
2531 case CCValAssign::BCvt:
2532 MemVT = VA.getLocVT();
2533 break;
2534 case CCValAssign::SExt:
2535 ExtType = ISD::SEXTLOAD;
2536 break;
2537 case CCValAssign::ZExt:
2538 ExtType = ISD::ZEXTLOAD;
2539 break;
2540 case CCValAssign::AExt:
2541 ExtType = ISD::EXTLOAD;
2542 break;
2543 }
2544
2545 SDValue ArgValue = DAG.getExtLoad(
2546 ExtType, SL, VA.getLocVT(), Chain, FIN,
2548
2549 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2550 if (ConvertedVal == ArgValue)
2551 return ConvertedVal;
2552
// A conversion node was inserted; merge the value with the load's chain.
2553 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2554}
2555
/// Compute a workgroup id component, accounting for clusters: without cluster
/// support this is just the preloaded workgroup id; with clusters the global
/// id is reconstructed from cluster id, cluster size, and the in-cluster
/// workgroup id. Two of the PreloadedValue parameters (WorkGroupIdPV,
/// ClusterMaxIdPV) sit on elided lines (original 2558-2559).
2556SDValue SITargetLowering::lowerWorkGroupId(
2557 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2560 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2561 if (!Subtarget->hasClusters())
2562 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2563
2564 // Clusters are supported. Return the global position in the grid. If clusters
2565 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2566
2567 // WorkGroupIdXYZ = ClusterId == 0 ?
2568 // ClusterIdXYZ :
2569 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2570 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2571 SDLoc SL(ClusterIdXYZ);
2572 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2573 SDValue One = DAG.getConstant(1, SL, VT);
2574 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2575 SDValue ClusterWorkGroupIdXYZ =
2576 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2577 SDValue GlobalIdXYZ =
2578 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2579 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2580
// Dispatch on whether cluster dimensions are statically known. The case
// labels themselves are on elided lines (originals 2582-2583, 2585, 2587).
2581 switch (MFI.getClusterDims().getKind()) {
2584 return GlobalIdXYZ;
2586 return ClusterIdXYZ;
2588 using namespace AMDGPU::Hwreg;
// Unknown at compile time: read IB_STS2 to learn whether clusters are in
// use, and select between the two forms at runtime.
2589 SDValue ClusterIdField =
2590 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2591 SDNode *GetReg =
2592 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2593 SDValue ClusterId(GetReg, 0);
2594 SDValue Zero = DAG.getConstant(0, SL, VT);
2595 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2596 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2597 }
2598 }
2599
2600 llvm_unreachable("nothing should reach here");
2601}
2602
/// Materialize a preloaded function input (workgroup ids, cluster ids, etc.).
/// With architected SGPRs the ids live at fixed TTMP registers / bitfields;
/// otherwise the location comes from SIMachineFunctionInfo. Returns a
/// constant when the value is statically known, poison when the function is
/// marked as not using it. Many case labels in the switch below sit on
/// elided lines in this dump.
2603SDValue SITargetLowering::getPreloadedValue(
2604 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2606 const ArgDescriptor *Reg = nullptr;
2607 const TargetRegisterClass *RC;
2608 LLT Ty;
2609
// Fixed TTMP locations used when architected SGPRs are available.
2611 const ArgDescriptor WorkGroupIDX =
2612 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2613 // If GridZ is not programmed in an entry function then the hardware will set
2614 // it to all zeros, so there is no need to mask the GridY value in the low
2615 // order bits.
2616 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2617 AMDGPU::TTMP7,
2618 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2619 const ArgDescriptor WorkGroupIDZ =
2620 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
// Cluster-related ids are packed as 4-bit fields of TTMP6.
2621 const ArgDescriptor ClusterWorkGroupIDX =
2622 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2623 const ArgDescriptor ClusterWorkGroupIDY =
2624 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2625 const ArgDescriptor ClusterWorkGroupIDZ =
2626 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2627 const ArgDescriptor ClusterWorkGroupMaxIDX =
2628 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2629 const ArgDescriptor ClusterWorkGroupMaxIDY =
2630 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2631 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2632 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2633 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2634 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2635
2636 auto LoadConstant = [&](unsigned N) {
2637 return DAG.getConstant(N, SDLoc(), VT);
2638 };
2639
// Architected-SGPR path: pick the fixed descriptor for the requested id,
// folding to constants when cluster dimensions are statically fixed.
2640 if (Subtarget->hasArchitectedSGPRs() &&
2642 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2643 bool HasFixedDims = ClusterDims.isFixedDims();
2644
2645 switch (PVID) {
2647 Reg = &WorkGroupIDX;
2648 RC = &AMDGPU::SReg_32RegClass;
2649 Ty = LLT::scalar(32);
2650 break;
2652 Reg = &WorkGroupIDY;
2653 RC = &AMDGPU::SReg_32RegClass;
2654 Ty = LLT::scalar(32);
2655 break;
2657 Reg = &WorkGroupIDZ;
2658 RC = &AMDGPU::SReg_32RegClass;
2659 Ty = LLT::scalar(32);
2660 break;
2662 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2663 return LoadConstant(0);
2664 Reg = &ClusterWorkGroupIDX;
2665 RC = &AMDGPU::SReg_32RegClass;
2666 Ty = LLT::scalar(32);
2667 break;
2669 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2670 return LoadConstant(0);
2671 Reg = &ClusterWorkGroupIDY;
2672 RC = &AMDGPU::SReg_32RegClass;
2673 Ty = LLT::scalar(32);
2674 break;
2676 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2677 return LoadConstant(0);
2678 Reg = &ClusterWorkGroupIDZ;
2679 RC = &AMDGPU::SReg_32RegClass;
2680 Ty = LLT::scalar(32);
2681 break;
2683 if (HasFixedDims)
2684 return LoadConstant(ClusterDims.getDims()[0] - 1);
2685 Reg = &ClusterWorkGroupMaxIDX;
2686 RC = &AMDGPU::SReg_32RegClass;
2687 Ty = LLT::scalar(32);
2688 break;
2690 if (HasFixedDims)
2691 return LoadConstant(ClusterDims.getDims()[1] - 1);
2692 Reg = &ClusterWorkGroupMaxIDY;
2693 RC = &AMDGPU::SReg_32RegClass;
2694 Ty = LLT::scalar(32);
2695 break;
2697 if (HasFixedDims)
2698 return LoadConstant(ClusterDims.getDims()[2] - 1);
2699 Reg = &ClusterWorkGroupMaxIDZ;
2700 RC = &AMDGPU::SReg_32RegClass;
2701 Ty = LLT::scalar(32);
2702 break;
2704 Reg = &ClusterWorkGroupMaxFlatID;
2705 RC = &AMDGPU::SReg_32RegClass;
2706 Ty = LLT::scalar(32);
2707 break;
2708 default:
2709 break;
2710 }
2711 }
2712
// Fallback: ask the function info where the value was preloaded.
2713 if (!Reg)
2714 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2715 if (!Reg) {
2717 // It's possible for a kernarg intrinsic call to appear in a kernel with
2718 // no allocated segment, in which case we do not add the user sgpr
2719 // argument, so just return null.
2720 return DAG.getConstant(0, SDLoc(), VT);
2721 }
2722
2723 // It's undefined behavior if a function marked with the amdgpu-no-*
2724 // attributes uses the corresponding intrinsic.
2725 return DAG.getPOISON(VT);
2726 }
2727
2728 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2729}
2730
// Walks the incoming arguments, recording which PS inputs are allocated and
// enabled, and collecting the arguments to keep into Splits. Unused,
// never-allocated PS inputs are skipped entirely (recorded in Skipped).
// The function's opening signature line is elided in this dump (original
// 2731; a static helper — likely processPSInputArgs).
2732 CallingConv::ID CallConv,
2733 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2734 FunctionType *FType,
2735 SIMachineFunctionInfo *Info) {
2736 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2737 const ISD::InputArg *Arg = &Ins[I];
2738
2739 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2740 "vector type argument should have been split");
2741
2742 // First check if it's a PS input addr.
2743 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2744 PSInputNum <= 15) {
2745 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2746
2747 // Inconveniently only the first part of the split is marked as isSplit,
2748 // so skip to the end. We only want to increment PSInputNum once for the
2749 // entire split argument.
2750 if (Arg->Flags.isSplit()) {
2751 while (!Arg->Flags.isSplitEnd()) {
2752 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2753 "unexpected vector split in ps argument type");
2754 if (!SkipArg)
2755 Splits.push_back(*Arg);
2756 Arg = &Ins[++I];
2757 }
2758 }
2759
2760 if (SkipArg) {
2761 // We can safely skip PS inputs.
2762 Skipped.set(Arg->getOrigArgIndex());
2763 ++PSInputNum;
2764 continue;
2765 }
2766
2767 Info->markPSInputAllocated(PSInputNum);
2768 if (Arg->Used)
2769 Info->markPSInputEnabled(PSInputNum);
2770
2771 ++PSInputNum;
2772 }
2773
2774 Splits.push_back(*Arg);
2775 }
2776}
2777
2778// Allocate special inputs passed in VGPRs.
// For entry functions: workitem ids arrive in VGPR0..VGPR2 (or packed as
// 10-bit fields of VGPR0 on subtargets with packed TID). Each used id is
// registered as a live-in and recorded in the function info. The signature
// line is elided in this dump (original 2779).
2780 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2781 SIMachineFunctionInfo &Info) const {
2782 const LLT S32 = LLT::scalar(32);
2784
2785 if (Info.hasWorkItemIDX()) {
2786 Register Reg = AMDGPU::VGPR0;
2787 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2788
2789 CCInfo.AllocateReg(Reg);
// With packed TID and a used Y id, X occupies only the low 10 bits.
2790 unsigned Mask =
2791 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2792 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2793 }
2794
2795 if (Info.hasWorkItemIDY()) {
2796 assert(Info.hasWorkItemIDX());
2797 if (Subtarget->hasPackedTID()) {
2798 Info.setWorkItemIDY(
2799 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2800 } else {
2801 unsigned Reg = AMDGPU::VGPR1;
2802 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2803
2804 CCInfo.AllocateReg(Reg);
2805 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2806 }
2807 }
2808
2809 if (Info.hasWorkItemIDZ()) {
2810 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2811 if (Subtarget->hasPackedTID()) {
2812 Info.setWorkItemIDZ(
2813 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2814 } else {
2815 unsigned Reg = AMDGPU::VGPR2;
2816 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2817
2818 CCInfo.AllocateReg(Reg);
2819 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2820 }
2821 }
2822}
2823
2824// Try to allocate a VGPR at the end of the argument list, or if no argument
2825// VGPRs are left allocating a stack slot.
2826// If \p Mask is is given it indicates bitfield position in the register.
2827// If \p Arg is given use it with new ]p Mask instead of allocating new.
2828static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2829 ArgDescriptor Arg = ArgDescriptor()) {
2830 if (Arg.isSet())
2831 return ArgDescriptor::createArg(Arg, Mask);
2832
2833 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2834 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2835 if (RegIdx == ArgVGPRs.size()) {
2836 // Spill to stack required.
2837 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2838
2839 return ArgDescriptor::createStack(Offset, Mask);
2840 }
2841
2842 unsigned Reg = ArgVGPRs[RegIdx];
2843 Reg = CCInfo.AllocateReg(Reg);
2844 assert(Reg != AMDGPU::NoRegister);
2845
2846 MachineFunction &MF = CCInfo.getMachineFunction();
2847 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2848 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2849 return ArgDescriptor::createRegister(Reg, Mask);
2850}
2851
// Allocate the next unallocated SGPR from \p RC as a live-in input; fatal
// error if none remain. The signature line and the final return (originals
// 2852, 2866) are elided in this dump.
// NOTE(review): the NumArgRegs parameter is not referenced in the visible
// body — the ArrayRef is built with a hard-coded 32. Confirm against
// upstream whether that is intentional.
2853 const TargetRegisterClass *RC,
2854 unsigned NumArgRegs) {
2855 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2856 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2857 if (RegIdx == ArgSGPRs.size())
2858 report_fatal_error("ran out of SGPRs for arguments");
2859
2860 unsigned Reg = ArgSGPRs[RegIdx];
2861 Reg = CCInfo.AllocateReg(Reg);
2862 assert(Reg != AMDGPU::NoRegister);
2863
2864 MachineFunction &MF = CCInfo.getMachineFunction();
2865 MF.addLiveIn(Reg, RC);
2867}
2868
2869// If this has a fixed position, we still should allocate the register in the
2870// CCInfo state. Technically we could get away with this for values passed
2871// outside of the normal argument range.
// Marks a pre-chosen SGPR as allocated and live-in. Signature line elided
// (original 2872).
2873 const TargetRegisterClass *RC,
2874 MCRegister Reg) {
2875 Reg = CCInfo.AllocateReg(Reg);
2876 assert(Reg != AMDGPU::NoRegister);
2877 MachineFunction &MF = CCInfo.getMachineFunction();
2878 MF.addLiveIn(Reg, RC);
2879}
2880
2881static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2882 if (Arg) {
2883 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2884 Arg.getRegister());
2885 } else
2886 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2887}
2888
2889static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2890 if (Arg) {
2891 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2892 Arg.getRegister());
2893 } else
2894 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2895}
2896
2897/// Allocate implicit function VGPR arguments at the end of allocated user
2898/// arguments.
/// Workitem ids share one VGPR when packed (10-bit fields) by threading the
/// previous descriptor into the next allocation. Signature line elided
/// (original 2899).
2900 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2901 SIMachineFunctionInfo &Info) const {
2902 const unsigned Mask = 0x3ff;
2903 ArgDescriptor Arg;
2904
2905 if (Info.hasWorkItemIDX()) {
2906 Arg = allocateVGPR32Input(CCInfo, Mask);
2907 Info.setWorkItemIDX(Arg);
2908 }
2909
2910 if (Info.hasWorkItemIDY()) {
2911 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2912 Info.setWorkItemIDY(Arg);
2913 }
2914
2915 if (Info.hasWorkItemIDZ())
2916 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2917}
2918
2919/// Allocate implicit function VGPR arguments in fixed registers.
/// All three workitem ids are packed into VGPR31 as 10-bit fields.
/// Signature line elided (original 2920).
2921 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2922 SIMachineFunctionInfo &Info) const {
2923 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2924 if (!Reg)
2925 report_fatal_error("failed to allocate VGPR for implicit arguments");
2926
2927 const unsigned Mask = 0x3ff;
2928 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2929 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2930 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2931}
2932
// Allocate the special SGPR inputs a non-entry function needs (dispatch ptr,
// queue ptr, implicit arg ptr, dispatch id, workgroup ids, LDS kernel id),
// each only if the function actually uses it. Signature line elided
// (original 2933).
2934 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2935 SIMachineFunctionInfo &Info) const {
2936 auto &ArgInfo = Info.getArgInfo();
2937 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2938
2939 // TODO: Unify handling with private memory pointers.
2940 if (UserSGPRInfo.hasDispatchPtr())
2941 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2942
2943 if (UserSGPRInfo.hasQueuePtr())
2944 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2945
2946 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2947 // constant offset from the kernarg segment.
2948 if (Info.hasImplicitArgPtr())
2949 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2950
2951 if (UserSGPRInfo.hasDispatchID())
2952 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2953
2954 // flat_scratch_init is not applicable for non-kernel functions.
2955
2956 if (Info.hasWorkGroupIDX())
2957 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2958
2959 if (Info.hasWorkGroupIDY())
2960 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2961
2962 if (Info.hasWorkGroupIDZ())
2963 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2964
2965 if (Info.hasLDSKernelId())
2966 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2967}
2968
2969// Allocate special inputs passed in user SGPRs.
// Each HSA ABI user SGPR (implicit buffer ptr, private segment buffer,
// dispatch ptr, queue ptr, kernarg segment ptr, dispatch id, flat scratch
// init, private segment size) is registered as a live-in and marked allocated
// in CCInfo, in ABI order. Signature line elided (original 2970).
2971 MachineFunction &MF,
2972 const SIRegisterInfo &TRI,
2973 SIMachineFunctionInfo &Info) const {
2974 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2975 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2976 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2977 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2978 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2979 }
2980
2981 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2982 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2983 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2984 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2985 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2986 }
2987
2988 if (UserSGPRInfo.hasDispatchPtr()) {
2989 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2990 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2991 CCInfo.AllocateReg(DispatchPtrReg);
2992 }
2993
2994 if (UserSGPRInfo.hasQueuePtr()) {
2995 Register QueuePtrReg = Info.addQueuePtr(TRI);
2996 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2997 CCInfo.AllocateReg(QueuePtrReg);
2998 }
2999
3000 if (UserSGPRInfo.hasKernargSegmentPtr()) {
// The kernarg pointer's virtual register additionally gets a typed LLT so
// GlobalISel sees it as a 64-bit constant-address-space pointer.
3002 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3003 CCInfo.AllocateReg(InputPtrReg);
3004
3005 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3006 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
3007 }
3008
3009 if (UserSGPRInfo.hasDispatchID()) {
3010 Register DispatchIDReg = Info.addDispatchID(TRI);
3011 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3012 CCInfo.AllocateReg(DispatchIDReg);
3013 }
3014
3015 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3016 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3017 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3018 CCInfo.AllocateReg(FlatScratchInitReg);
3019 }
3020
3021 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3022 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3023 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3024 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3025 }
3026
3027 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3028 // these from the dispatch pointer.
3029}
3030
3031// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
3032// sequential starting from the first argument.
// Walks inreg kernel arguments in order, reserving user SGPRs for each one
// until the preload sequence breaks (non-inreg arg, non-original arg, or no
// free user SGPRs). Signature line elided (original 3033).
3034 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3036 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3037 Function &F = MF.getFunction();
3038 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3039 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3040 bool InPreloadSequence = true;
3041 unsigned InIdx = 0;
3042 bool AlignedForImplictArgs = false;
3043 unsigned ImplicitArgOffset = 0;
3044 for (auto &Arg : F.args()) {
3045 if (!InPreloadSequence || !Arg.hasInRegAttr())
3046 break;
3047
3048 unsigned ArgIdx = Arg.getArgNo();
3049 // Don't preload non-original args or parts not in the current preload
3050 // sequence.
3051 if (InIdx < Ins.size() &&
3052 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3053 break;
3054
// One IR argument may have been split into several Ins entries; handle all
// parts belonging to ArgIdx.
3055 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3056 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3057 InIdx++) {
3058 assert(ArgLocs[ArgIdx].isMemLoc());
3059 auto &ArgLoc = ArgLocs[InIdx];
3060 const Align KernelArgBaseAlign = Align(16);
3061 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3062 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
3063 unsigned NumAllocSGPRs =
3064 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3065
3066 // Fix alignment for hidden arguments.
3067 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3068 if (!AlignedForImplictArgs) {
3069 ImplicitArgOffset =
3070 alignTo(LastExplicitArgOffset,
3071 Subtarget->getAlignmentForImplicitArgPtr()) -
3072 LastExplicitArgOffset;
3073 AlignedForImplictArgs = true;
3074 }
3075 ArgOffset += ImplicitArgOffset;
3076 }
3077
3078 // Arg is preloaded into the previous SGPR.
3079 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3080 assert(InIdx >= 1 && "No previous SGPR");
3081 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3082 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3083 continue;
3084 }
3085
// Gaps between arguments still consume whole SGPRs of padding.
3086 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3087 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3088 // Check for free user SGPRs for preloading.
3089 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3090 InPreloadSequence = false;
3091 break;
3092 }
3093
3094 // Preload this argument.
3095 const TargetRegisterClass *RC =
3096 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3097 SmallVectorImpl<MCRegister> *PreloadRegs =
3098 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3099
3100 if (PreloadRegs->size() > 1)
3101 RC = &AMDGPU::SGPR_32RegClass;
3102 for (auto &Reg : *PreloadRegs) {
3103 assert(Reg);
3104 MF.addLiveIn(Reg, RC);
3105 CCInfo.AllocateReg(Reg);
3106 }
3107
3108 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3109 }
3110 }
3111}
3112
// NOTE(review): the first signature line of this method (original line 3113)
// was lost in extraction; the body below is unchanged. Based on the call
// site later in this file, this is allocateLDSKernelId(CCInfo, MF, TRI,
// Info). Reserves one SGPR carrying the LDS kernel id, if requested.
3114 const SIRegisterInfo &TRI,
3115 SIMachineFunctionInfo &Info) const {
3116 // Always allocate this last since it is a synthetic preload.
3117 if (Info.hasLDSKernelId()) {
// Record the register as a function live-in and mark it allocated so the
// calling-convention state does not hand it out again.
3118 Register Reg = Info.addLDSKernelId();
3119 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3120 CCInfo.AllocateReg(Reg);
3121 }
3122 }
3123
3124 // Allocate special input registers that are initialized per-wave.
// NOTE(review): the signature lines of this method (original lines
// 3125-3126) were lost in extraction; the body below is unchanged. Based on
// the call site later in this file, this is allocateSystemSGPRs(CCInfo, MF,
// Info, CallConv, IsShader). Allocates the per-wave system SGPRs (workgroup
// IDs, workgroup info, private segment wave byte offset) as live-ins.
3127 CallingConv::ID CallConv,
3128 bool IsShader) const {
3129 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3130 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3131 // Note: user SGPRs are handled by the front-end for graphics shaders
3132 // Pad up the used user SGPRs with dead inputs.
3133
3134 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3135 // before enabling architected SGPRs for workgroup IDs.
3136 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3137
3138 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3139 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3140 // rely on it to reach 16 since if we end up having no stack usage, it will
3141 // not really be added.
3142 unsigned NumRequiredSystemSGPRs =
3143 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3144 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
// Reserve dead user SGPRs until user + system SGPRs reach 16 total.
3145 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3146 Register Reg = Info.addReservedUserSGPR();
3147 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3148 CCInfo.AllocateReg(Reg);
3149 }
3150 }
3151
// Workgroup IDs only need SGPR inputs when they are not provided by the
// architected SGPRs feature.
3152 if (!HasArchitectedSGPRs) {
3153 if (Info.hasWorkGroupIDX()) {
3154 Register Reg = Info.addWorkGroupIDX();
3155 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3156 CCInfo.AllocateReg(Reg);
3157 }
3158
3159 if (Info.hasWorkGroupIDY()) {
3160 Register Reg = Info.addWorkGroupIDY();
3161 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3162 CCInfo.AllocateReg(Reg);
3163 }
3164
3165 if (Info.hasWorkGroupIDZ()) {
3166 Register Reg = Info.addWorkGroupIDZ();
3167 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3168 CCInfo.AllocateReg(Reg);
3169 }
3170 }
3171
3172 if (Info.hasWorkGroupInfo()) {
3173 Register Reg = Info.addWorkGroupInfo();
3174 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3175 CCInfo.AllocateReg(Reg);
3176 }
3177
3178 if (Info.hasPrivateSegmentWaveByteOffset()) {
3179 // Scratch wave offset passed in system SGPR.
3180 unsigned PrivateSegmentWaveByteOffsetReg;
3181
3182 if (IsShader) {
3183 PrivateSegmentWaveByteOffsetReg =
3184 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3185
3186 // This is true if the scratch wave byte offset doesn't have a fixed
3187 // location.
3188 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3189 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3190 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3191 }
3192 } else
3193 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3194
3195 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3196 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3197 }
3198
// Sanity check for the hasUserSGPRInit16BugInWave32 padding loop above.
3199 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3200 Info.getNumPreloadedSGPRs() >= 16);
3201 }
3202
// NOTE(review): the first signature line of this function (original line
// 3203) and a few statement lines (e.g. the condition before the
// `HasStackObjects = true;` at original line 3221, the initializer at 3234,
// and the declaration at 3251) were lost in extraction; the remaining body
// is unchanged. Decides which physical registers to reserve for the scratch
// resource descriptor, stack pointer, and frame pointer of this function.
3204 MachineFunction &MF,
3205 const SIRegisterInfo &TRI,
3206 SIMachineFunctionInfo &Info) {
3207 // Now that we've figured out where the scratch register inputs are, see if we
3208 // should reserve the arguments and use them directly.
3209 MachineFrameInfo &MFI = MF.getFrameInfo();
3210 bool HasStackObjects = MFI.hasStackObjects();
3211 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3212
3213 // Record that we know we have non-spill stack objects so we don't need to
3214 // check all stack objects later.
3215 if (HasStackObjects)
3216 Info.setHasNonSpillStackObjects(true);
3217
3218 // Everything live out of a block is spilled with fast regalloc, so it's
3219 // almost certain that spilling will be required.
3221 HasStackObjects = true;
3222
3223 // For now assume stack access is needed in any callee functions, so we need
3224 // the scratch registers to pass in.
3225 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3226
3227 if (!ST.hasFlatScratchEnabled()) {
3228 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3229 // If we have stack objects, we unquestionably need the private buffer
3230 // resource. For the Code Object V2 ABI, this will be the first 4 user
3231 // SGPR inputs. We can reserve those and use them directly.
3232
3233 Register PrivateSegmentBufferReg =
3235 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3236 } else {
3237 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3238 // We tentatively reserve the last registers (skipping the last registers
3239 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3240 // we'll replace these with the ones immediately after those which were
3241 // really allocated. In the prologue copies will be inserted from the
3242 // argument to these reserved registers.
3243
3244 // Without HSA, relocations are used for the scratch pointer and the
3245 // buffer resource setup is always inserted in the prologue. Scratch wave
3246 // offset is still in an input SGPR.
3247 Info.setScratchRSrcReg(ReservedBufferReg);
3248 }
3249 }
3250
3252
3253 // For entry functions we have to set up the stack pointer if we use it,
3254 // whereas non-entry functions get this "for free". This means there is no
3255 // intrinsic advantage to using S32 over S34 in cases where we do not have
3256 // calls but do need a frame pointer (i.e. if we are requested to have one
3257 // because frame pointer elimination is disabled). To keep things simple we
3258 // only ever use S32 as the call ABI stack pointer, and so using it does not
3259 // imply we need a separate frame pointer.
3260 //
3261 // Try to use s32 as the SP, but move it if it would interfere with input
3262 // arguments. This won't work with calls though.
3263 //
3264 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3265 // registers.
3266 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3267 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3268 } else {
3270
3271 if (MFI.hasCalls())
3272 report_fatal_error("call in graphics shader with too many input SGPRs");
3273
// Fall back to the first SGPR not already consumed by a live-in argument.
3274 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3275 if (!MRI.isLiveIn(Reg)) {
3276 Info.setStackPtrOffsetReg(Reg);
3277 break;
3278 }
3279 }
3280
// SP_REG is the "unset" sentinel: still holding it means no SGPR was free.
3281 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3282 report_fatal_error("failed to find register for SP");
3283 }
3284
3285 // hasFP should be accurate for entry functions even before the frame is
3286 // finalized, because it does not rely on the known stack size, only
3287 // properties like whether variable sized objects are present.
3288 if (ST.getFrameLowering()->hasFP(MF)) {
3289 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3290 }
3291 }
3292
// NOTE(review): the opening lines of this small predicate (original lines
// 3293-3294, including its signature) were lost in extraction; only the
// return statement and closing brace remain. Split CSR handling is only
// supported for non-entry functions.
3295 return !Info->isEntryFunction();
3296 }
3297
3299
// NOTE(review): the first signature line of this method (original line 3300)
// and the declaration at original line 3303 were lost in extraction; the
// body below is unchanged. For split-CSR lowering: copies each
// callee-saved register into a fresh virtual register in the entry block,
// and copies it back before the terminator of every exit block.
3301 MachineBasicBlock *Entry,
3302 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3304
3305 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3306 if (!IStart)
3307 return;
3308
3309 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3310 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3311 MachineBasicBlock::iterator MBBI = Entry->begin();
// IStart is a null-terminated list of physical CSRs to handle via copies.
3312 for (const MCPhysReg *I = IStart; *I; ++I) {
3313 const TargetRegisterClass *RC = nullptr;
3314 if (AMDGPU::SReg_64RegClass.contains(*I))
3315 RC = &AMDGPU::SGPR_64RegClass;
3316 else if (AMDGPU::SReg_32RegClass.contains(*I))
3317 RC = &AMDGPU::SGPR_32RegClass;
3318 else
3319 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3320
3321 Register NewVR = MRI->createVirtualRegister(RC);
3322 // Create copy from CSR to a virtual register.
3323 Entry->addLiveIn(*I);
3324 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3325 .addReg(*I);
3326
3327 // Insert the copy-back instructions right before the terminator.
3328 for (auto *Exit : Exits)
3329 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3330 TII->get(TargetOpcode::COPY), *I)
3331 .addReg(NewVR);
3332 }
3333 }
3334
// NOTE(review): this is SITargetLowering::LowerFormalArguments. Several
// lines that contained linked identifiers were lost in extraction (the
// signature opening at original line 3335, and local declarations such as
// those at 3339, 3341, 3343-3344, 3348, 3353-3354, 3461, 3502-3503,
// 3519-3520, 3539-3540, 3550, 3558, 3578, 3594, 3652, and 3764); the
// remaining body is unchanged. Lowers incoming formal arguments: allocates
// special entry/user/system SGPRs and input VGPRs, then materializes each
// argument either from kernarg memory, from preloaded SGPRs, from the
// stack, or from an argument register.
3336 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3337 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3338 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3340
3342 const Function &Fn = MF.getFunction();
3345 bool IsError = false;
3346
// Graphics calling conventions are not supported under HSA; diagnose and
// lower the remaining arguments as poison so compilation can continue.
3347 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3349 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3350 IsError = true;
3351 }
3352
3355 BitVector Skipped(Ins.size());
3356 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3357 *DAG.getContext());
3358
3359 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3360 bool IsKernel = AMDGPU::isKernel(CallConv);
3361 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3362
// Sanity-check that graphics functions never request compute-only inputs.
3363 if (IsGraphics) {
3364 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3365 assert(!UserSGPRInfo.hasDispatchPtr() &&
3366 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3367 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3368 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3369 (void)UserSGPRInfo;
3370 if (!Subtarget->hasFlatScratchEnabled())
3371 assert(!UserSGPRInfo.hasFlatScratchInit());
3372 if ((CallConv != CallingConv::AMDGPU_CS &&
3373 CallConv != CallingConv::AMDGPU_Gfx &&
3374 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3375 !Subtarget->hasArchitectedSGPRs())
3376 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3377 !Info->hasWorkGroupIDZ());
3378 }
3379
3380 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3381
3382 if (CallConv == CallingConv::AMDGPU_PS) {
3383 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3384
3385 // At least one interpolation mode must be enabled or else the GPU will
3386 // hang.
3387 //
3388 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3389 // set PSInputAddr, the user wants to enable some bits after the compilation
3390 // based on run-time states. Since we can't know what the final PSInputEna
3391 // will look like, so we shouldn't do anything here and the user should take
3392 // responsibility for the correct programming.
3393 //
3394 // Otherwise, the following restrictions apply:
3395 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3396 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3397 // enabled too.
3398 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3399 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3400 CCInfo.AllocateReg(AMDGPU::VGPR0);
3401 CCInfo.AllocateReg(AMDGPU::VGPR1);
3402 Info->markPSInputAllocated(0);
3403 Info->markPSInputEnabled(0);
3404 }
3405 if (Subtarget->isAmdPalOS()) {
3406 // For isAmdPalOS, the user does not enable some bits after compilation
3407 // based on run-time states; the register values being generated here are
3408 // the final ones set in hardware. Therefore we need to apply the
3409 // workaround to PSInputAddr and PSInputEnable together. (The case where
3410 // a bit is set in PSInputAddr but not PSInputEnable is where the
3411 // frontend set up an input arg for a particular interpolation mode, but
3412 // nothing uses that input arg. Really we should have an earlier pass
3413 // that removes such an arg.)
3414 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3415 if ((PsInputBits & 0x7F) == 0 ||
3416 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3417 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3418 }
3419 } else if (IsKernel) {
3420 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3421 } else {
// Whole-wave functions carry a synthetic first input (the setup value);
// it is not part of the user-visible argument list.
3422 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3423 Ins.end());
3424 }
3425
3426 if (IsKernel)
3427 analyzeFormalArgumentsCompute(CCInfo, Ins);
3428
// Reserve the special input registers before analyzing the user arguments.
3429 if (IsEntryFunc) {
3430 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3431 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3432 if (IsKernel && Subtarget->hasKernargPreload())
3433 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3434
3435 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3436 } else if (!IsGraphics) {
3437 // For the fixed ABI, pass workitem IDs in the last argument register.
3438 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3439
3440 // FIXME: Sink this into allocateSpecialInputSGPRs
3441 if (!Subtarget->hasFlatScratchEnabled())
3442 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3443
3444 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3445 }
3446
3447 if (!IsKernel) {
3448 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3449 CCInfo.AnalyzeFormalArguments(Splits, AssignFn)
3450
3451 // This assumes the registers are allocated by CCInfo in ascending order
3452 // with no gaps.
3453 Info->setNumWaveDispatchSGPRs(
3454 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3455 Info->setNumWaveDispatchVGPRs(
3456 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3457 } else if (Info->getNumKernargPreloadedSGPRs()) {
3458 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3459 }
3460
3462
3463 if (IsWholeWaveFunc) {
3464 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3465 {MVT::i1, MVT::Other}, Chain);
3466 InVals.push_back(Setup.getValue(0));
3467 Chains.push_back(Setup.getValue(1));
3468 }
3469
3470 // FIXME: This is the minimum kernel argument alignment. We should improve
3471 // this to the maximum alignment of the arguments.
3472 //
3473 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3474 // kern arg offset.
3475 const Align KernelArgBaseAlign = Align(16);
3476
3477 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3478 ++i) {
3479 const ISD::InputArg &Arg = Ins[i];
3480 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3481 InVals.push_back(DAG.getPOISON(Arg.VT));
3482 continue;
3483 }
3484
3485 CCValAssign &VA = ArgLocs[ArgIdx++];
3486 MVT VT = VA.getLocVT();
3487
// Kernel arguments live in the kernarg segment (memory), possibly with
// some pieces preloaded into user SGPRs.
3488 if (IsEntryFunc && VA.isMemLoc()) {
3489 VT = Ins[i].VT;
3490 EVT MemVT = VA.getLocVT();
3491
3492 const uint64_t Offset = VA.getLocMemOffset();
3493 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3494
3495 if (Arg.Flags.isByRef()) {
3496 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3497
3498 const GCNTargetMachine &TM =
3499 static_cast<const GCNTargetMachine &>(getTargetMachine());
3500 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3501 Arg.Flags.getPointerAddrSpace())) {
3504 }
3505
3506 InVals.push_back(Ptr);
3507 continue;
3508 }
3509
3510 SDValue NewArg;
3511 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3512 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3513 // In this case the argument is packed into the previous preload SGPR.
3514 int64_t AlignDownOffset = alignDown(Offset, 4);
3515 int64_t OffsetDiff = Offset - AlignDownOffset;
3516 EVT IntVT = MemVT.changeTypeToInteger();
3517
3518 const SIMachineFunctionInfo *Info =
3521 Register Reg =
3522 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3523
3524 assert(Reg);
3525 Register VReg = MRI.getLiveInVirtReg(Reg);
3526 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3527
// Shift the packed sub-dword value down to bit 0 before truncating.
3528 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3529 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3530
3531 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3532 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3533 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3534 Ins[i].Flags.isSExt(), &Ins[i]);
3535
3536 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3537 } else {
3538 const SIMachineFunctionInfo *Info =
3541 const SmallVectorImpl<MCRegister> &PreloadRegs =
3542 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3543
3544 SDValue Copy;
3545 if (PreloadRegs.size() == 1) {
3546 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3547 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3548 NewArg = DAG.getCopyFromReg(
3549 Chain, DL, VReg,
3551 TRI->getRegSizeInBits(*RC)));
3552
3553 } else {
3554 // If the kernarg alignment does not match the alignment of the SGPR
3555 // tuple RC that can accommodate this argument, it will be built up
3556 // via copies from the individual SGPRs that the argument was
3557 // preloaded to.
3559 for (auto Reg : PreloadRegs) {
3560 Register VReg = MRI.getLiveInVirtReg(Reg);
3561 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3562 Elts.push_back(Copy);
3563 }
3564 NewArg =
3565 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3566 PreloadRegs.size()),
3567 DL, Elts);
3568 }
3569
3570 // If the argument was preloaded to multiple consecutive 32-bit
3571 // registers because of misalignment between addressable SGPR tuples
3572 // and the argument size, we can still assume that because of kernarg
3573 // segment alignment restrictions that NewArg's size is the same as
3574 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3575 // truncate since we cannot preload to less than a single SGPR and the
3576 // MemVT may be smaller.
3577 EVT MemVTInt =
3579 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3580 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3581
3582 NewArg = DAG.getBitcast(MemVT, NewArg);
3583 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3584 Ins[i].Flags.isSExt(), &Ins[i]);
3585 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3586 }
3587 } else {
3588 // Hidden arguments that are in the kernel signature must be preloaded
3589 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3590 // the argument list and is not preloaded.
3591 if (Arg.isOrigArg()) {
3592 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3593 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3595 *OrigArg->getParent(),
3596 "hidden argument in kernel signature was not preloaded",
3597 DL.getDebugLoc()));
3598 }
3599 }
3600
3601 NewArg =
3602 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3603 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3604 }
3605 Chains.push_back(NewArg.getValue(1));
3606
3607 auto *ParamTy =
3608 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3609 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3610 ParamTy &&
3611 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3612 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3613 // On SI local pointers are just offsets into LDS, so they are always
3614 // less than 16-bits. On CI and newer they could potentially be
3615 // real pointers, so we can't guarantee their size.
3616 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3617 DAG.getValueType(MVT::i16));
3618 }
3619
3620 InVals.push_back(NewArg);
3621 continue;
3622 }
// Non-entry functions take stack arguments via loads from the stack.
3623 if (!IsEntryFunc && VA.isMemLoc()) {
3624 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3625 InVals.push_back(Val);
3626 if (!Arg.Flags.isByVal())
3627 Chains.push_back(Val.getValue(1));
3628 continue;
3629 }
3630
3631 assert(VA.isRegLoc() && "Parameter must be in a register!");
3632
3633 Register Reg = VA.getLocReg();
3634 const TargetRegisterClass *RC = nullptr;
3635 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3636 RC = &AMDGPU::VGPR_32RegClass;
3637 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3638 RC = &AMDGPU::SGPR_32RegClass;
3639 else
3640 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3641
3642 Reg = MF.addLiveIn(Reg, RC);
3643 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3644
3645 if (Arg.Flags.isSRet()) {
3646 // The return object should be reasonably addressable.
3647
3648 // FIXME: This helps when the return is a real sret. If it is an
3649 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3650 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3651 unsigned NumBits =
3653 Val = DAG.getNode(
3654 ISD::AssertZext, DL, VT, Val,
3655 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3656 }
3657
3658 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3659 InVals.push_back(Val);
3660 }
3661
3662 // Start adding system SGPRs.
3663 if (IsEntryFunc)
3664 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3665
3666 unsigned StackArgSize = CCInfo.getStackSize();
3667 Info->setBytesInStackArgArea(StackArgSize);
3668
3669 return Chains.empty() ? Chain
3670 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3671 }
3672
3673 // TODO: If return values can't fit in registers, we should return as many as
3674 // possible in registers before passing on stack.
// NOTE(review): the signature opening line (original line 3675) and the
// RVLocs declaration (original line 3685) were lost in extraction; the body
// below is unchanged. Returns true when the return values can be lowered in
// registers; false forces an sret-style return through memory.
3676 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3677 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3678 const Type *RetTy) const {
3679 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3680 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3681 // for shaders. Vector types should be explicitly handled by CC.
3682 if (AMDGPU::isEntryFunctionCC(CallConv))
3683 return true;
3684
3686 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3687 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3688 return false;
3689
3690 // We must use the stack if return would require unavailable registers.
3691 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3692 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
// Reject any assignment that landed in a VGPR above this function's limit.
3693 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3694 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3695 return false;
3696
3697 return true;
3698 }
3699
// NOTE(review): several lines of this method were lost in extraction (the
// signature lines at original 3701/3703, declarations at 3706-3708, 3721,
// 3731, and the SGPR readfirstlane call at 3764); the body below is
// unchanged. Lowers a function return: kernels defer to the AMDGPU base
// implementation, shaders that return void end the wave, everything else
// copies the return values into the ABI registers.
3700 SDValue
3702 bool isVarArg,
3704 const SmallVectorImpl<SDValue> &OutVals,
3705 const SDLoc &DL, SelectionDAG &DAG) const {
3709
3710 if (AMDGPU::isKernel(CallConv)) {
3711 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3712 OutVals, DL, DAG);
3713 }
3714
3715 bool IsShader = AMDGPU::isShader(CallConv);
3716
3717 Info->setIfReturnsVoid(Outs.empty());
3718 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3719
3720 // CCValAssign - represent the assignment of the return value to a location.
3722
3723 // CCState - Info about the registers and stack slots.
3724 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3725 *DAG.getContext());
3726
3727 // Analyze outgoing return values.
3728 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg))
3729
3730 SDValue Glue;
3732 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3733
3734 SDValue ReadFirstLane =
3735 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3736 // Copy the result values into the output registers.
3737 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3738 ++I, ++RealRVLocIdx) {
3739 CCValAssign &VA = RVLocs[I];
3740 assert(VA.isRegLoc() && "Can only return in registers!");
3741 // TODO: Partially return in registers if return values don't fit.
3742 SDValue Arg = OutVals[RealRVLocIdx];
3743
3744 // Copied from other backends.
3745 switch (VA.getLocInfo()) {
3746 case CCValAssign::Full:
3747 break;
3748 case CCValAssign::BCvt:
3749 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3750 break;
3751 case CCValAssign::SExt:
3752 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3753 break;
3754 case CCValAssign::ZExt:
3755 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3756 break;
3757 case CCValAssign::AExt:
3758 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3759 break;
3760 default:
3761 llvm_unreachable("Unknown loc info!");
3762 }
// Values returned in SGPRs must be uniform; force uniformity here (the
// call at the lost original line 3764 wraps Arg via ReadFirstLane).
3763 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3765 ReadFirstLane, Arg);
3766 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3767 Glue = Chain.getValue(1);
3768 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3769 }
3770
3771 // FIXME: Does sret work properly?
3772 if (!Info->isEntryFunction()) {
3773 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3774 const MCPhysReg *I =
3775 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3776 if (I) {
3777 for (; *I; ++I) {
3778 if (AMDGPU::SReg_64RegClass.contains(*I))
3779 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3780 else if (AMDGPU::SReg_32RegClass.contains(*I))
3781 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3782 else
3783 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3784 }
3785 }
3786 }
3787
3788 // Update chain and glue.
3789 RetOps[0] = Chain;
3790 if (Glue.getNode())
3791 RetOps.push_back(Glue);
3792
// Pick the return pseudo: ENDPGM for void shader returns, otherwise the
// whole-wave / epilog / glued-return variant for this calling convention.
3793 unsigned Opc = AMDGPUISD::ENDPGM;
3794 if (!IsWaveEnd)
3795 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3796 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3797 : AMDGPUISD::RET_GLUE;
3798 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3799 }
3800
// NOTE(review): the signature opening line (original line 3800/3801) and
// the RVLocs declaration (original line 3809) were lost in extraction; the
// body below is unchanged. Copies the return values of a call out of their
// assigned physical registers into InVals, applying any extension or
// bitcast conversions recorded by the calling convention.
3802 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3803 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3804 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3805 SDValue ThisVal) const {
3806 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3807
3808 // Assign locations to each value returned by this call.
3810 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3811 *DAG.getContext());
3812 CCInfo.AnalyzeCallResult(Ins, RetCC);
3813
3814 // Copy all of the result registers out of their specified physreg.
3815 for (CCValAssign VA : RVLocs) {
3816 SDValue Val;
3817
3818 if (VA.isRegLoc()) {
// Thread the chain and glue through each copy so the copies stay ordered
// immediately after the call.
3819 Val =
3820 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3821 Chain = Val.getValue(1);
3822 InGlue = Val.getValue(2);
3823 } else if (VA.isMemLoc()) {
3824 report_fatal_error("TODO: return values in memory");
3825 } else
3826 llvm_unreachable("unknown argument location type");
3827
// Undo any promotion the calling convention applied to the value.
3828 switch (VA.getLocInfo()) {
3829 case CCValAssign::Full:
3830 break;
3831 case CCValAssign::BCvt:
3832 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3833 break;
3834 case CCValAssign::ZExt:
3835 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3836 DAG.getValueType(VA.getValVT()));
3837 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3838 break;
3839 case CCValAssign::SExt:
3840 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3841 DAG.getValueType(VA.getValVT()));
3842 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3843 break;
3844 case CCValAssign::AExt:
3845 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3846 break;
3847 default:
3848 llvm_unreachable("Unknown loc info!");
3849 }
3850
3851 InVals.push_back(Val);
3852 }
3853
3854 return Chain;
3855 }
3856
3857// Add code to pass special inputs required depending on used features separate
3858// from the explicit user arguments present in the IR.
3860 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3861 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3862 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3863 // If we don't have a call site, this was a call inserted by
3864 // legalization. These can never use special inputs.
3865 if (!CLI.CB)
3866 return;
3867
3868 SelectionDAG &DAG = CLI.DAG;
3869 const SDLoc &DL = CLI.DL;
3870 const Function &F = DAG.getMachineFunction().getFunction();
3871
3872 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3873 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3874
3875 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3877
3878 // TODO: Unify with private memory register handling. This is complicated by
3879 // the fact that at least in kernels, the input argument is not necessarily
3880 // in the same location as the input.
3881 // clang-format off
3882 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3883 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3884 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3885 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3886 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3887 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3888 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3889 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3890 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3891 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3892 };
3893 // clang-format on
3894
3895 for (auto [InputID, Attrs] : ImplicitAttrs) {
3896 // If the callee does not use the attribute value, skip copying the value.
3897 if (all_of(Attrs, [&](StringRef Attr) {
3898 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3899 }))
3900 continue;
3901
3902 const auto [OutgoingArg, ArgRC, ArgTy] =
3903 CalleeArgInfo.getPreloadedValue(InputID);
3904 if (!OutgoingArg)
3905 continue;
3906
3907 const auto [IncomingArg, IncomingArgRC, Ty] =
3908 CallerArgInfo.getPreloadedValue(InputID);
3909 assert(IncomingArgRC == ArgRC);
3910
3911 // All special arguments are ints for now.
3912 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3913 SDValue InputReg;
3914
3915 if (IncomingArg) {
3916 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3917 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3918 // The implicit arg ptr is special because it doesn't have a corresponding
3919 // input for kernels, and is computed from the kernarg segment pointer.
3920 InputReg = getImplicitArgPtr(DAG, DL);
3921 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3922 std::optional<uint32_t> Id =
3924 if (Id.has_value()) {
3925 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3926 } else {
3927 InputReg = DAG.getPOISON(ArgVT);
3928 }
3929 } else {
3930 // We may have proven the input wasn't needed, although the ABI is
3931 // requiring it. We just need to allocate the register appropriately.
3932 InputReg = DAG.getPOISON(ArgVT);
3933 }
3934
3935 if (OutgoingArg->isRegister()) {
3936 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3937 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3938 report_fatal_error("failed to allocate implicit input argument");
3939 } else {
3940 unsigned SpecialArgOffset =
3941 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3942 SDValue ArgStore =
3943 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3944 MemOpChains.push_back(ArgStore);
3945 }
3946 }
3947
3948 // Pack workitem IDs into a single register or pass it as is if already
3949 // packed.
3950
3951 auto [OutgoingArg, ArgRC, Ty] =
3953 if (!OutgoingArg)
3954 std::tie(OutgoingArg, ArgRC, Ty) =
3956 if (!OutgoingArg)
3957 std::tie(OutgoingArg, ArgRC, Ty) =
3959 if (!OutgoingArg)
3960 return;
3961
3962 const ArgDescriptor *IncomingArgX = std::get<0>(
3964 const ArgDescriptor *IncomingArgY = std::get<0>(
3966 const ArgDescriptor *IncomingArgZ = std::get<0>(
3968
3969 SDValue InputReg;
3970 SDLoc SL;
3971
3972 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3973 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3974 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3975
3976 // If incoming ids are not packed we need to pack them.
3977 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
3978 NeedWorkItemIDX) {
3979 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3980 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3981 } else {
3982 InputReg = DAG.getConstant(0, DL, MVT::i32);
3983 }
3984 }
3985
3986 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
3987 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3988 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3989 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3990 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3991 InputReg = InputReg.getNode()
3992 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3993 : Y;
3994 }
3995
3996 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
3997 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3998 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3999 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4000 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4001 InputReg = InputReg.getNode()
4002 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4003 : Z;
4004 }
4005
4006 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4007 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4008 // We're in a situation where the outgoing function requires the workitem
4009 // ID, but the calling function does not have it (e.g a graphics function
4010 // calling a C calling convention function). This is illegal, but we need
4011 // to produce something.
4012 InputReg = DAG.getPOISON(MVT::i32);
4013 } else {
4014 // Workitem ids are already packed, any of present incoming arguments
4015 // will carry all required fields.
4016 ArgDescriptor IncomingArg =
4017 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4018 : IncomingArgY ? *IncomingArgY
4019 : *IncomingArgZ,
4020 ~0u);
4021 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4022 }
4023 }
4024
4025 if (OutgoingArg->isRegister()) {
4026 if (InputReg)
4027 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4028
4029 CCInfo.AllocateReg(OutgoingArg->getRegister());
4030 } else {
4031 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4032 if (InputReg) {
4033 SDValue ArgStore =
4034 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4035 MemOpChains.push_back(ArgStore);
4036 }
4037 }
4038}
4039
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    // NOTE(review): the listing this was recovered from drops original line
    // 4042 here — presumably the `const SmallVectorImpl<ISD::OutputArg> &Outs`
    // parameter (the body uses `Outs` below). Restore from upstream.
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  // Decide whether this call site may be lowered as a tail call.
  //
  // Chain calls (llvm.amdgcn.cs.chain) never return to the caller, so they
  // are unconditionally eligible.
  if (AMDGPU::isChainCC(CalleeCC))
    return true;

  // Some calling conventions can never be tail-called at all.
  if (!AMDGPU::mayTailCallThisCC(CalleeCC))
    return false;

  // For a divergent call target, we need to do a waterfall loop over the
  // possible callees which precludes us from using a simple jump.
  if (Callee->isDivergent())
    return false;

  // NOTE(review): dropped original line 4056 here — presumably the
  // declaration of `MF` (MachineFunction &). Restore from upstream.
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  // NOTE(review): dropped original line 4059 here — presumably the
  // declaration of `TRI` (const SIRegisterInfo *). Restore from upstream.
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  // NOTE(review): dropped original line 4069 here — presumably the opening
  // `if` of this brace block (likely testing GuaranteedTailCallOpt). Restore
  // from upstream; as-is the braces below are unbalanced.
    if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  // Byval arguments require copying the caller's incoming stack area, which
  // is incompatible with reusing it for a tail call.
  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  // NOTE(review): dropped original line 4103 here — presumably the
  // declaration of `ArgLocs` (SmallVector<CCValAssign, 16>). Restore from
  // upstream.
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  // FIXME: We are not allocating special input registers, so we will be
  // deciding based on incorrect register assignments.
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    // FIXME: What about inreg arguments that end up passed in memory?
    if (!CCVA.isRegLoc())
      continue;

    // If we are passing an argument in an SGPR, and the value is divergent,
    // this call requires a waterfall loop.
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      LLVM_DEBUG(
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
                 << printReg(CCVA.getLocReg(), TRI) << '\n');
      return false;
    }
  }

  // Finally, all outgoing arguments assigned to callee-saved registers must
  // already hold the matching incoming values.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}
4135
  // A call marked `tail` by the frontend may be emitted as a tail call,
  // unless the parent function rules it out.
  if (!CI->isTailCall())
    return false;

  const Function *ParentFn = CI->getFunction();
  // NOTE(review): dropped original line 4141 here — the condition guarding
  // this `return false` (presumably a check on ParentFn's calling convention,
  // e.g. entry functions cannot tail call). Restore from upstream; as-is the
  // `return false` below is unconditional.
    return false;
  return true;
}
4145
namespace {
/// Operand positions of the special trailing arguments of an
/// llvm.amdgcn.cs.chain call. They tag along at the end of the argument
/// list(s), after the SGPR arguments (index 0) and the VGPR arguments
/// (index 1).
enum ChainCallArgIdx {
  Exec = 2,           ///< Value to install in EXEC for the chained callee.
  Flags = 3,          ///< Flags selecting which optional args follow.
  NumVGPRs = 4,       ///< VGPR count (dynamic-VGPR mode only).
  FallbackExec = 5,   ///< EXEC value for the fallback path.
  FallbackCallee = 6, ///< Callee to use on the fallback path.
};
} // anonymous namespace
4158
// The wave scratch offset register is used as the global base pointer.
// NOTE(review): dropped original line 4160 here — the function signature
// (presumably `SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,`).
// Restore from upstream.
                                   SmallVectorImpl<SDValue> &InVals) const {
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsChainCallConv = AMDGPU::isChainCC(CallConv);

  SelectionDAG &DAG = CLI.DAG;

  const SDLoc &DL = CLI.DL;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;

  llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    // The last arguments should be the value that we need to put in EXEC,
    // followed by the flags and any other arguments with special meanings.
    // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
    // we don't treat them like the "real" arguments.
    auto RequestedExecIt =
        llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
          return Arg.OrigArgIndex == 2;
        });
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
                      CLI.OutVals.end());
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

    assert(CLI.Outs.back().OrigArgIndex < 2 &&
           "Haven't popped all the special args");

    TargetLowering::ArgListEntry RequestedExecArg =
        CLI.Args[ChainCallArgIdx::Exec];
    // The EXEC argument must be a wavefront-sized integer.
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
      return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");

    // Convert constants into TargetConstants, so they become immediate operands
    // instead of being selected into S_MOV.
    auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
      if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
        ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
      } else
        ChainCallSpecialArgs.push_back(Arg.Node);
    };

    PushNodeOrTargetConstant(RequestedExecArg);

    // Process any other special arguments depending on the value of the flags.
    TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];

    const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
        return lowerUnhandledCall(CLI, InVals,
                                  "no additional args allowed if flags == 0");
    } else if (FlagsValue.isOneBitSet(0)) {
      // Bit 0 set: dynamic-VGPR variant; expects NumVGPRs, FallbackExec and
      // FallbackCallee to follow.
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
      }

      if (!Subtarget->isWave32()) {
        return lowerUnhandledCall(
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");
      }

      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);
    }
  }

  // NOTE(review): dropped original line 4233 here — presumably the
  // declaration of `Outs` (SmallVector<ISD::OutputArg, 32> & = CLI.Outs).
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  // NOTE(review): dropped original line 4235 here — presumably the
  // declaration of `Ins` (SmallVector<ISD::InputArg, 32> & = CLI.Ins).
  bool &IsTailCall = CLI.IsTailCall;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;
  // NOTE(review): dropped original line 4239 here — presumably the
  // declaration of `MF` (MachineFunction &). Restore from upstream.

  // Undefined or null callees never execute; just return poison results.
  if (Callee.isUndef() || isNullConstant(Callee)) {
    if (!CLI.IsTailCall) {
      for (ISD::InputArg &Arg : CLI.Ins)
        InVals.push_back(DAG.getPOISON(Arg.VT));
    }

    return Chain;
  }

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported call to variadic function ");
  }

  if (!CLI.CB)
    return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              "unsupported required tail call to function ");
  }

  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
                                                   Outs, OutVals, Ins, DAG);
    // musttail and chain calls must be tail calls; failing to make them so is
    // a hard error rather than a silent fallback.
    if (!IsTailCall &&
        ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail or on llvm.amdgcn.cs.chain");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  // NOTE(review): dropped original lines 4283-4284 here — presumably the
  // declarations of `Info` (SIMachineFunctionInfo *) and `RegsToPass`.
  SmallVector<SDValue, 8> MemOpChains;

  // Analyze operands of the call, assigning locations to each operand.
  // NOTE(review): dropped original line 4288 here — presumably the
  // declaration of `ArgLocs` (SmallVector<CCValAssign, 16>).
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
  // NOTE(review): dropped original line 4293 here — the final clause of this
  // condition (ends with `) {`). Restore from upstream; as-is the condition
  // is incomplete.
    // With a fixed ABI, allocate fixed registers before user arguments.
    passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
  }

  // Mark the scratch resource descriptor as allocated so the CC analysis
  // does not assign user arguments to these registers, matching the callee.
  if (!Subtarget->hasFlatScratchEnabled())
    CCInfo.AllocateReg(Info->getScratchRSrcReg());

  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getStackSize();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  auto *TRI = Subtarget->getRegisterInfo();

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->hasFlatScratchEnabled()) {
      SmallVector<SDValue, 4> CopyFromChains;

      // In the HSA case, this should be an identity copy.
      SDValue ScratchRSrcReg =
          DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
                              ScratchRSrcReg);
      CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
      Chain = DAG.getTokenFactor(DL, CopyFromChains);
    }
  }

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[i];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
      MaybeAlign Alignment;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                                          : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Alignment =
            Flags.isByVal()
                ? Flags.getNonZeroByValAlign()
                : commonAlignment(Subtarget->getStackAlignment(), Offset);

        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
      } else {
        // Stores to the argument stack area are relative to the stack pointer.
        SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
                                        MVT::i32);
        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Alignment =
            commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy =
            DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
                          Outs[i].Flags.getNonZeroByValAlign(),
                          /*isVol = */ false, /*AlwaysInline = */ true,
                          /*CI=*/nullptr, std::nullopt, DstInfo,
        // NOTE(review): dropped original line 4433 here — the final argument
        // of this getMemcpy call (ends with `));`). Restore from upstream.

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store =
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);

  SDValue TokenGlue;
  if (CLI.ConvergenceControlToken) {
    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
    // NOTE(review): dropped original line 4453 here — the final operand of
    // this getNode call (presumably `CLI.ConvergenceControlToken);`).
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
      // For chain calls, the inreg arguments are required to be
      // uniform. Speculatively Insert a readfirstlane in case we cannot prove
      // they are uniform.
      //
      // For other calls, if an inreg arguments is known to be uniform,
      // speculatively insert a readfirstlane in case it is in a VGPR.
      //
      // FIXME: We need to execute this in a waterfall loop if it is a divergent
      // value, so let that continue to produce invalid code.

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(TokenGlue);
      // NOTE(review): dropped original line 4477 here — the statement that
      // consumes `ReadfirstlaneArgs` (presumably an INTRINSIC_WO_CHAIN
      // getNode assigned back to Val). Restore from upstream.
                          ReadfirstlaneArgs);
    }

    Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
    InGlue = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
    InGlue = Chain.getValue(1);
  }

  std::vector<SDValue> Ops({Chain});

  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  // NOTE(review): dropped original line 4498 here — the opening `if` of this
  // brace block that binds `GSD` (presumably a dyn_cast<GlobalAddressSDNode>
  // of Callee). Restore from upstream.
    const GlobalValue *GV = GSD->getGlobal();
    Ops.push_back(Callee);
    Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
  } else {
    if (IsTailCall) {
      // isEligibleForTailCallOptimization considered whether the call target is
      // divergent, but we may still end up with a uniform value in a VGPR.
      // Insert a readfirstlane just in case.
      SDValue ReadFirstLaneID =
          DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
      Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
                           ReadfirstlaneArgs);
    }

    Ops.push_back(Callee);
    Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
  }

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
  }

  if (IsChainCallConv)
    llvm::append_range(Ops, ChainCallSpecialArgs);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &[Reg, Val] : RegsToPass)
    Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (SDValue Token = CLI.ConvergenceControlToken) {
    // NOTE(review): dropped original line 4542 here — presumably the
    // declaration of `GlueOps` (SmallVector<SDValue>). Restore from upstream.
    GlueOps.push_back(Token);
    if (InGlue)
      GlueOps.push_back(InGlue);

    InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
                                        MVT::Glue, GlueOps),
                     0);
  }

  if (InGlue)
    Ops.push_back(InGlue);

  // If we're doing a tall call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    unsigned OPC = AMDGPUISD::TC_RETURN;
    switch (CallConv) {
    // NOTE(review): dropped original line 4561 here — presumably the
    // `case CallingConv::...:` label guarding the TC_RETURN_GFX arm.
      OPC = AMDGPUISD::TC_RETURN_GFX;
      break;
    // NOTE(review): dropped original lines 4564-4565 here — presumably the
    // chain-calling-convention `case` labels guarding the arm below.
      OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                             : AMDGPUISD::TC_RETURN_CHAIN;
      break;
    }

    // If the caller is a whole wave function, we need to use a special opcode
    // so we can patch up EXEC.
    if (Info->isWholeWaveFunction())
      OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;

    return DAG.getNode(OPC, DL, MVT::Other, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
  if (!Ins.empty())
    InGlue = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, /*IsThisReturn=*/false, SDValue());
}
4594
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for:
// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
// NOTE(review): dropped original line 4599 here — the function signature
// (presumably `SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue
// Op,`). Restore from upstream.
                                                  SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  // NOTE(review): dropped original line 4602 here — presumably the
  // declaration of `Info` (SIMachineFunctionInfo *), used below for the
  // stack-pointer register. Restore from upstream.

  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(0);
  Register SPReg = Info->getStackPtrOffsetReg();

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  SDValue Size = Op.getOperand(1);
  SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();

  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
  // NOTE(review): dropped original line 4618 here — the first line of an
  // assert whose message follows on the next line. Restore from upstream.
         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);
  Align StackAlign = TFL->getStackAlign();
  if (Alignment > StackAlign) {
    // Over-align the base: the per-lane alignment must be scaled by the
    // wavefront size because the swizzled stack interleaves lanes.
    uint64_t ScaledAlignment = Alignment.value()
                               << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;
    SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
                                  DAG.getConstant(StackAlignMask, dl, VT));
    BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
                           DAG.getSignedConstant(-ScaledAlignment, dl, VT));
  }

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  SDValue NewSP;
  // NOTE(review): dropped original line 4635 here — the `if` opening this
  // brace block (presumably testing whether Size is a constant node).
  // Restore from upstream.
    // For constant sized alloca, scale alloca size by wave-size
    SDValue ScaledSize = DAG.getNode(
        ISD::SHL, dl, VT, Size,
        DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
    NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
  } else {
    // For dynamic sized alloca, perform wave-wide reduction to get max of
    // alloca size(divergent) and then scale it by wave-size
    SDValue WaveReduction =
        DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
    Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
                       Size, DAG.getConstant(0, dl, MVT::i32));
    SDValue ScaledSize = DAG.getNode(
        ISD::SHL, dl, VT, Size,
        DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
    NewSP =
        DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
    NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
                        NewSP);
  }

  Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
  SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

  // Result 0 is the (aligned) old SP — the address of the new allocation;
  // result 1 is the chain.
  return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
}
4664
  // Lower STACKSAVE: read the wave-uniform SP and convert it to a per-lane
  // (swizzled) vector address.
  if (Op.getValueType() != MVT::i32)
    return Op; // Defer to cannot select error.

  // NOTE(review): dropped original line 4669 here — presumably the
  // declaration of `SP` (the stack-pointer Register used below). Restore
  // from upstream.
  SDLoc SL(Op);

  SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);

  // Convert from wave uniform to swizzled vector address. This should protect
  // from any edge cases where the stacksave result isn't directly used with
  // stackrestore.
  SDValue VectorAddress =
      DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
  return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
}
4681
// NOTE(review): dropped original line 4682 here — the function signature
// (presumably `SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,`).
                                             SelectionDAG &DAG) const {
  // Lower GET_ROUNDING: read the MODE register's fp_round field and map the
  // raw hardware value to the FLT_ROUNDS enumeration via a bit table.
  SDLoc SL(Op);
  assert(Op.getValueType() == MVT::i32);

  uint32_t BothRoundHwReg =
  // NOTE(review): dropped original line 4688 here — the initializer
  // (presumably an AMDGPU::Hwreg encoding of the MODE register's rounding
  // bits). Restore from upstream.
  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);

  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
                               Op.getOperand(0), IntrinID, GetRoundBothImm);

  // There are two rounding modes, one for f32 and one for f64/f16. We only
  // report in the standard value range if both are the same.
  //
  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
  // ties away from zero is not supported, and the other values are rotated by
  // 1.
  //
  // If the two rounding modes are not the same, report a target defined value.

  // Mode register rounding mode fields:
  //
  // [1:0] Single-precision round mode.
  // [3:2] Double/Half-precision round mode.
  //
  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
  //
  //             Hardware Spec
  // Toward-0        3        0
  // Nearest Even    0        1
  // +Inf            1        2
  // -Inf            2        3
  // NearestAway0   N/A       4
  //
  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
  // table we can index by the raw hardware mode.
  //
  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf

  SDValue BitTable =
  // NOTE(review): dropped original line 4725 here — the initializer
  // (presumably a 64-bit constant built from AMDGPU::FltRoundConversionTable).
  // Restore from upstream.

  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
  SDValue RoundModeTimesNumBits =
      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);

  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
  // knew only one mode was demanded.
  SDValue TableValue =
      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);

  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
  SDValue TableEntry =
      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);

  // There's a gap in the 4-bit encoded table and actual enum values, so offset
  // if it's an extended value.
  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
  SDValue IsStandardValue =
      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
                               TableEntry, EnumOffset);

  // Return both the mapped rounding value and the chain from the getreg.
  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
}
4752
// NOTE(review): dropped original line 4753 here — the function signature
// (presumably `SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,`).
                                             SelectionDAG &DAG) const {
  // Lower SET_ROUNDING: translate a C FLT_ROUNDS value into the hardware
  // MODE.fp_round encoding and write it with s_setreg.
  SDLoc SL(Op);

  SDValue NewMode = Op.getOperand(1);
  assert(NewMode.getValueType() == MVT::i32);

  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
  // hardware MODE.fp_round values.
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
    // Constant mode: do the table lookup at compile time.
    uint32_t ClampedVal = std::min(
        static_cast<uint32_t>(ConstMode->getZExtValue()),
    // NOTE(review): dropped original line 4765 here — the second argument of
    // this std::min (presumably the maximum valid table index). Restore from
    // upstream.
    NewMode = DAG.getConstant(
        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
  } else {
    // If we know the input can only be one of the supported standard modes in
    // the range 0-3, we can use a simplified mapping to hardware values.
    KnownBits KB = DAG.computeKnownBits(NewMode);
    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
    // The supported standard values are 0-3. The extended values start at 8. We
    // need to offset by 4 if the value is in the extended range.

    if (UseReducedTable) {
      // Truncate to the low 32-bits.
      SDValue BitTable = DAG.getConstant(
          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);

      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);

      NewMode =
          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);

      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
      // the table extracted bits into inline immediates.
    } else {
      // table_index = umin(value, value - 4)
      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
      SDValue BitTable =
      // NOTE(review): dropped original line 4794 here — the initializer
      // (presumably the full 64-bit FltRoundToHWConversionTable constant).
      // Restore from upstream.

      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
      SDValue IndexVal =
          DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);

      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);

      SDValue TableValue =
          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);

      // No need to mask out the high bits since the setreg will ignore them
      // anyway.
      NewMode = TruncTable;
    }

    // Insert a readfirstlane in case the value is a VGPR. We could do this
    // earlier and keep more operations scalar, but that interferes with
    // combining the source.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
    NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
                          ReadFirstLaneID, NewMode);
  }

  // N.B. The setreg will be later folded into s_round_mode on supported
  // targets.
  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
  uint32_t BothRoundHwReg =
  // NOTE(review): dropped original line 4828 here — the initializer
  // (presumably an AMDGPU::Hwreg encoding of the MODE register's rounding
  // bits). Restore from upstream.
  SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);

  SDValue SetReg =
      DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
                  IntrinID, RoundBothImm, NewMode);

  return SetReg;
}
4837
// NOTE(review): the signature line for this function is elided in this
// rendering (presumably SITargetLowering::lowerPREFETCH); several switch
// case labels below are elided as well — confirm against the full file.
// Returning an empty SDValue() tells the caller to expand/discard the
// prefetch; returning Op keeps the node as-is (legal).
4839 if (Op->isDivergent() &&
4840 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4841 // Cannot do I$ prefetch with divergent pointer.
4842 return SDValue();
4843
// Filter by address space. The case labels are elided here; the visible
// structure breaks out for the always-legal spaces and allows the next
// group only when the target has safe SMEM prefetch support.
4844 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4848 break;
4850 if (Subtarget->hasSafeSmemPrefetch())
4851 break;
4852 [[fallthrough]];
4853 default:
4854 return SDValue();
4855 }
4856
4857 // I$ prefetch
// Operand 4 distinguishes the prefetch kind; without safe SMEM prefetch
// only one kind can be emitted here.
4858 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4859 return SDValue();
4860
// Node is legal as written.
4861 return Op;
4862}
4863
4864// Work around DAG legality rules only based on the result type.
// NOTE(review): the signature line is elided in this rendering (presumably
// SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG)).
// Rewrites an FP_EXTEND from bf16 as an integer bitcast followed by
// BF16_TO_FP, since legality here depends on the *source* type, which the
// generic legalizer does not consider.
4866 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4867 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4868 EVT SrcVT = Src.getValueType();
4869
// Non-bf16 sources need no workaround.
4870 if (SrcVT.getScalarType() != MVT::bf16)
4871 return Op;
4872
4873 SDLoc SL(Op);
// Reinterpret the bf16 (vector) value as its integer equivalent.
4874 SDValue BitCast =
4875 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4876
4877 EVT DstVT = Op.getValueType();
// Strict FP extension from bf16 is not implemented yet.
4878 if (IsStrict)
4879 llvm_unreachable("Need STRICT_BF16_TO_FP");
4880
4881 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4882}
4883
// NOTE(review): the signature line is elided in this rendering (presumably
// SITargetLowering::lowerGET_FPENV). Reads the MODE and TRAPSTS hardware
// registers via s_getreg and packs them into a single i64 FP environment
// value. The hwreg encoding expressions (lines 4890/4893) are elided.
4885 SDLoc SL(Op);
4886 if (Op.getValueType() != MVT::i64)
4887 return Op;
4888
4889 uint32_t ModeHwReg =
4891 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4892 uint32_t TrapHwReg =
4894 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4895
// Each s_getreg produces an i32 result plus a chain.
4896 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4897 SDValue IntrinID =
4898 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4899 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4900 Op.getOperand(0), IntrinID, ModeHwRegImm);
4901 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4902 Op.getOperand(0), IntrinID, TrapHwRegImm);
// Merge the two chains so both reads are ordered with the caller's chain.
4903 SDValue TokenReg =
4904 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4905 GetTrapReg.getValue(1));
4906
// Pack {mode, trap} as v2i32 and reinterpret as the i64 result.
4907 SDValue CvtPtr =
4908 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4909 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4910
4911 return DAG.getMergeValues({Result, TokenReg}, SL);
4912}
4913
// NOTE(review): the signature line is elided in this rendering (presumably
// SITargetLowering::lowerSET_FPENV). Unpacks an i64 FP environment value
// into its two i32 halves and writes them to the MODE and TRAPSTS hardware
// registers via s_setreg. The hwreg encoding expressions (lines 4933/4936)
// are elided.
4915 SDLoc SL(Op);
4916 if (Op.getOperand(1).getValueType() != MVT::i64)
4917 return Op;
4918
// Split the i64 environment into {mode, trap} i32 lanes.
4919 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4920 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4921 DAG.getConstant(0, SL, MVT::i32));
4922 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4923 DAG.getConstant(1, SL, MVT::i32));
4924
// s_setreg takes an SGPR source; readfirstlane guards against the value
// living in a VGPR.
4925 SDValue ReadFirstLaneID =
4926 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4927 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4928 ReadFirstLaneID, NewModeReg);
4929 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4930 ReadFirstLaneID, NewTrapReg);
4931
4932 unsigned ModeHwReg =
4934 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4935 unsigned TrapHwReg =
4937 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4938
4939 SDValue IntrinID =
4940 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
// Both writes chain off the incoming chain and are joined by a
// TokenFactor, so no ordering between the two is implied.
4941 SDValue SetModeReg =
4942 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4943 IntrinID, ModeHwRegImm, NewModeReg);
4944 SDValue SetTrapReg =
4945 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4946 IntrinID, TrapHwRegImm, NewTrapReg);
4947 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4948}
4949
// Resolve a named physical register (for llvm.read_register /
// llvm.write_register). NOTE(review): the first signature line and the
// StringSwitch header line (4954) are elided in this rendering.
4951 const MachineFunction &MF) const {
4952 const Function &Fn = MF.getFunction();
4953
// Map the textual register name onto the AMDGPU physical register.
// An unrecognized name yields the invalid Register() default.
4955 .Case("m0", AMDGPU::M0)
4956 .Case("exec", AMDGPU::EXEC)
4957 .Case("exec_lo", AMDGPU::EXEC_LO)
4958 .Case("exec_hi", AMDGPU::EXEC_HI)
4959 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4960 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4961 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4962 .Default(Register());
4963 if (!Reg)
4964 return Reg;
4965
// flat_scratch (and its halves) only exist on subtargets with the
// dedicated flat-scratch register.
4966 if (!Subtarget->hasFlatScrRegister() &&
4967 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4968 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4969 "\" for subtarget."));
4970 }
4971
// Validate that the requested value type matches the register width:
// 32-bit for the half registers and M0, 64-bit for the full pairs.
4972 switch (Reg) {
4973 case AMDGPU::M0:
4974 case AMDGPU::EXEC_LO:
4975 case AMDGPU::EXEC_HI:
4976 case AMDGPU::FLAT_SCR_LO:
4977 case AMDGPU::FLAT_SCR_HI:
4978 if (VT.getSizeInBits() == 32)
4979 return Reg;
4980 break;
4981 case AMDGPU::EXEC:
4982 case AMDGPU::FLAT_SCR:
4983 if (VT.getSizeInBits() == 64)
4984 return Reg;
4985 break;
4986 default:
4987 llvm_unreachable("missing register type checking");
4988 }
4989
// Size mismatch: report a fatal error (the report call line is elided).
4991 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4992}
4993
4994// If kill is not the last instruction, split the block so kill is always a
4995// proper terminator.
// NOTE(review): the first signature line (4996/4997) is elided; this is
// presumably SITargetLowering::splitKillBlock(MachineInstr &MI, ...).
// Returns the block that follows the (now terminating) kill.
4998 MachineBasicBlock *BB) const {
// splitAt leaves MI as the last instruction of BB and moves the rest
// into SplitBB, updating live-ins.
4999 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
// Swap the pseudo for its terminator form so MI is a legal terminator.
5001 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5002 return SplitBB;
5003}
5004
5005// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
5006// \p MI will be the only instruction in the loop body block. Otherwise, it will
5007// be the first instruction in the remainder block.
5008//
5009/// \returns { LoopBody, Remainder }
5010static std::pair<MachineBasicBlock *, MachineBasicBlock *>
// NOTE(review): the parameter line (5011) is elided in this rendering;
// parameters are presumably (MachineInstr &MI, MachineBasicBlock &MBB,
// bool InstInLoop).
5012 MachineFunction *MF = MBB.getParent();
5014
5015 // To insert the loop we need to split the block. Move everything after this
5016 // point to a new block, and insert a new empty block between the two.
// The LoopBB creation line (5017) and the iterator init (5019) are elided.
5018 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5020 ++MBBI;
5021
5022 MF->insert(MBBI, LoopBB);
5023 MF->insert(MBBI, RemainderBB);
5024
// The loop body both falls through to the remainder and branches back
// to itself.
5025 LoopBB->addSuccessor(LoopBB);
5026 LoopBB->addSuccessor(RemainderBB);
5027
5028 // Move the rest of the block into a new block.
5029 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5030
5031 if (InstInLoop) {
5032 auto Next = std::next(I);
5033
5034 // Move instruction to loop body.
5035 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5036
5037 // Move the rest of the block.
5038 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5039 } else {
5040 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5041 }
5042
5043 MBB.addSuccessor(LoopBB);
5044
5045 return std::pair(LoopBB, RemainderBB);
5046}
5047
5048/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
// NOTE(review): the signature line (5049) is elided; presumably
// static void bundleInstWithWaitcnt(MachineInstr &MI). Bundling keeps the
// waitcnt glued to MI so later passes cannot separate them.
5050 MachineBasicBlock *MBB = MI.getParent();
// The TII initialization line (5051) is elided in this rendering.
5052 auto I = MI.getIterator();
5053 auto E = std::next(I);
5054
// s_waitcnt 0: wait for all outstanding memory counters.
5055 // clang-format off
5056 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5057 .addImm(0);
5058 // clang-format on
5059
// Wrap [MI, waitcnt] into a single BUNDLE.
5060 MIBundleBuilder Bundler(*MBB, I, E);
5061 finalizeBundle(*MBB, Bundler.begin());
5062}
5063
// NOTE(review): the first signature lines (5064/5065) are elided;
// presumably SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
// MachineBasicBlock *BB). Emits a retry loop around a GWS instruction:
// clear TRAP_STS.MEM_VIOL, execute the instruction, re-read the bit and
// loop while a memory violation occurred.
5066 MachineBasicBlock *BB) const {
5067 const DebugLoc &DL = MI.getDebugLoc();
5068
// Lines 5069/5071 (TII init and waitcnt bundling) are elided in this
// rendering.
5070
5072
5073 // Apparently kill flags are only valid if the def is in the same block?
5074 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5075 Src->setIsKill(false);
5076
// Place MI alone in the loop body; everything after goes to RemainderBB.
5077 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5078
5079 MachineBasicBlock::iterator I = LoopBB->end();
5080
// Encoded hwreg operand for the TRAP_STS.MEM_VIOL field (the encode()
// argument line 5082 is elided).
5081 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5083
5084 // Clear TRAP_STS.MEM_VIOL
5085 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5086 .addImm(0)
5087 .addImm(EncodedReg);
5088
// Line 5089 (MRI init) is elided in this rendering.
5090
5091 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5092
5093 // Load and check TRAP_STS.MEM_VIOL
5094 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5095 .addImm(EncodedReg)
5096
5097 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5098 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5099 .addReg(Reg, RegState::Kill)
5100 .addImm(0);
5101 // clang-format off
5102 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5103 .addMBB(LoopBB);
5104 // clang-format on
5105
5106 return RemainderBB;
5107}
5108
5109// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5110// wavefront. If the value is uniform and just happens to be in a VGPR, this
5111// will only do one iteration. In the worst case, this will loop 64 times.
5112//
5113// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
// NOTE(review): the leading signature lines (5114/5115) are elided; this
// is presumably static emitLoadM0FromVGPRLoop(const SIInstrInfo *TII,
// MachineRegisterInfo &MRI, ...). Emits the body of a waterfall loop:
// readfirstlane an index, mask EXEC to the lanes sharing that index,
// set up M0 (or SGPRIdxReg in GPR-index mode), then loop until EXEC is 0.
// Returns an iterator after which the per-iteration work should be placed.
5116 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5117 const DebugLoc &DL, const MachineOperand &Idx,
5118 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5119 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5120 Register &SGPRIdxReg) {
5121
5122 MachineFunction *MF = OrigBB.getParent();
5123 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5124 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Lines 5125/5126 (iterator and lane-mask-constants init) are elided.
5127
5128 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5129 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5130 Register NewExec = MRI.createVirtualRegister(BoolRC);
5131 Register CurrentIdxReg =
5132 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5133 Register CondReg = MRI.createVirtualRegister(BoolRC);
5134
// PHI for the accumulated result across loop iterations.
5135 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5136 .addReg(InitReg)
5137 .addMBB(&OrigBB)
5138 .addReg(ResultReg)
5139 .addMBB(&LoopBB);
5140
// PHI for the remaining-lanes exec mask.
5141 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5142 .addReg(InitSaveExecReg)
5143 .addMBB(&OrigBB)
5144 .addReg(NewExec)
5145 .addMBB(&LoopBB);
5146
5147 // Read the next variant <- also loop target.
5148 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5149 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5150
5151 // Compare the just read M0 value to all possible Idx values.
5152 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5153 .addReg(CurrentIdxReg)
5154 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5155
5156 // Update EXEC, save the original EXEC value to VCC.
5157 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5158 .addReg(CondReg, RegState::Kill);
5159
// Hint the allocator to reuse CondReg for NewExec.
5160 MRI.setSimpleHint(NewExec, CondReg);
5161
5162 if (UseGPRIdxMode) {
// GPR-index mode: hand the (possibly offset) index back via SGPRIdxReg.
5163 if (Offset == 0) {
5164 SGPRIdxReg = CurrentIdxReg;
5165 } else {
5166 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5167 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5168 .addReg(CurrentIdxReg, RegState::Kill)
5169 .addImm(Offset);
5170 }
5171 } else {
5172 // Move index from VCC into M0
5173 if (Offset == 0) {
5174 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5175 .addReg(CurrentIdxReg, RegState::Kill);
5176 } else {
5177 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5178 .addReg(CurrentIdxReg, RegState::Kill)
5179 .addImm(Offset);
5180 }
5181 }
5182
5183 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5184 MachineInstr *InsertPt =
5185 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5186 .addReg(LMC.ExecReg)
5187 .addReg(NewExec);
5188
5189 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5190 // s_cbranch_scc0?
5191
5192 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5193 // clang-format off
5194 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5195 .addMBB(&LoopBB);
5196 // clang-format on
5197
5198 return InsertPt->getIterator();
5199}
5200
5201// This has slightly sub-optimal regalloc when the source vector is killed by
5202// the read. The register allocator does not understand that the kill is
5203// per-workitem, so is kept alive for the whole loop so we end up not re-using a
5204// subregister from it, using 1 more VGPR than necessary. This was saved when
5205// this was expanded after register allocation.
// NOTE(review): the leading signature lines (5206/5207) are elided; this
// is presumably static loadM0FromVGPR(const SIInstrInfo *TII,
// MachineBasicBlock &MBB, MachineInstr &MI, ...). Wraps
// emitLoadM0FromVGPRLoop: saves EXEC, splits the block into a waterfall
// loop, and restores EXEC in a landing-pad block afterwards.
5208 unsigned InitResultReg, unsigned PhiReg, int Offset,
5209 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5210 MachineFunction *MF = MBB.getParent();
5211 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5212 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Lines 5213/5215 (MRI and lane-mask-constants init) are elided.
5214 const DebugLoc &DL = MI.getDebugLoc();
5216
5217 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5218 Register DstReg = MI.getOperand(0).getReg();
5219 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5220 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
// Line 5221 (iterator init) is elided.
5222
// TmpExec feeds the loop's exec PHI; IMPLICIT_DEF gives it a def in the
// preheader.
5223 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5224
5225 // Save the EXEC mask
5226 // clang-format off
5227 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5228 .addReg(LMC.ExecReg);
5229 // clang-format on
5230
// MI becomes the first instruction of the remainder block.
5231 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5232
5233 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5234
5235 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5236 InitResultReg, DstReg, PhiReg, TmpExec,
5237 Offset, UseGPRIdxMode, SGPRIdxReg);
5238
// Insert a landing pad between the loop and the remainder so EXEC can
// be restored exactly once on loop exit.
5239 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
// Line 5240 (iterator init) is elided.
5241 ++MBBI;
5242 MF->insert(MBBI, LandingPad);
5243 LoopBB->removeSuccessor(RemainderBB);
5244 LandingPad->addSuccessor(RemainderBB);
5245 LoopBB->addSuccessor(LandingPad);
5246 MachineBasicBlock::iterator First = LandingPad->begin();
5247 // clang-format off
5248 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5249 .addReg(SaveExec);
5250 // clang-format on
5251
5252 return InsPt;
5253}
5254
5255// Returns subreg index, offset
// NOTE(review): the first parameter line (5257, with the function name
// and the SIRegisterInfo &TRI parameter) is elided in this rendering.
// Folds a constant element offset into a subregister index when it is in
// bounds for the 32-bit-element vector register class; otherwise returns
// sub0 with the original (out-of-range) offset untouched.
5256static std::pair<unsigned, int>
5258 const TargetRegisterClass *SuperRC, unsigned VecReg,
5259 int Offset) {
// Number of 32-bit elements in the super register class.
5260 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5261
5262 // Skip out of bounds offsets, or else we would end up using an undefined
5263 // register.
5264 if (Offset >= NumElts || Offset < 0)
5265 return std::pair(AMDGPU::sub0, Offset);
5266
5267 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5268}
5269
// NOTE(review): the signature lines (5270/5271) are elided; presumably
// static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
// MachineBasicBlock::iterator I, MachineInstr &MI, int Offset).
// Materializes M0 = idx (+ Offset) from the instruction's SGPR index
// operand, for movrel-style indirect addressing.
5272 int Offset) {
5273 MachineBasicBlock *MBB = MI.getParent();
5274 const DebugLoc &DL = MI.getDebugLoc();
// Line 5275 (iterator init) is elided in this rendering.
5276
5277 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5278
5279 assert(Idx->getReg() != AMDGPU::NoRegister);
5280
5281 if (Offset == 0) {
// Plain copy when no additional offset is needed.
5282 // clang-format off
5283 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5284 .add(*Idx);
5285 // clang-format on
5286 } else {
5287 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5288 .add(*Idx)
5289 .addImm(Offset);
5290 }
5291}
5292
// NOTE(review): the signature lines (5293/5294) are elided; presumably
// static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
// MachineBasicBlock::iterator I, MachineInstr &MI, int Offset).
// Returns an SGPR holding idx (+ Offset) for GPR-index mode; reuses the
// original index register when no offset is required.
5295 int Offset) {
5296 MachineBasicBlock *MBB = MI.getParent();
5297 const DebugLoc &DL = MI.getDebugLoc();
// Line 5298 (MRI init) is elided in this rendering.
5299
5300 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5301
5302 if (Offset == 0)
5303 return Idx->getReg();
5304
// Compute idx + Offset into a fresh SGPR.
5305 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5306 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5307 .add(*Idx)
5308 .addImm(Offset);
5309 return Tmp;
5310}
5311
// NOTE(review): the signature lines (5312/5313) are elided; presumably
// static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
// MachineBasicBlock &MBB, const GCNSubtarget &ST). Expands an indirect
// vector-element read: direct movrels / GPR-index read for an SGPR
// index, or a waterfall loop when the index is in a VGPR.
5314 const GCNSubtarget &ST) {
5315 const SIInstrInfo *TII = ST.getInstrInfo();
5316 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5317 MachineFunction *MF = MBB.getParent();
// Line 5318 (MRI init) is elided in this rendering.
5319
5320 Register Dst = MI.getOperand(0).getReg();
5321 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5322 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5323 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5324
5325 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5326 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5327
// Fold the constant offset into a subregister index where possible.
5328 unsigned SubReg;
5329 std::tie(SubReg, Offset) =
5330 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5331
5332 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5333
5334 // Check for a SGPR index.
5335 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
// Line 5336 (iterator init) is elided.
5337 const DebugLoc &DL = MI.getDebugLoc();
5338
5339 if (UseGPRIdxMode) {
5340 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5341 // to avoid interfering with other uses, so probably requires a new
5342 // optimization pass.
// Line 5343 (Idx register setup, presumably via getIndirectSGPRIdx) is
// elided in this rendering.
5344
5345 const MCInstrDesc &GPRIDXDesc =
5346 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5347 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5348 .addReg(SrcReg)
5349 .addReg(Idx)
5350 .addImm(SubReg);
5351 } else {
// Line 5352 (setM0ToIndexFromSGPR call, presumably) is elided.
5353
// movrels reads Src[SubReg + M0]; the implicit use keeps the whole
// vector alive.
5354 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5355 .addReg(SrcReg, {}, SubReg)
5356 .addReg(SrcReg, RegState::Implicit);
5357 }
5358
5359 MI.eraseFromParent();
5360
5361 return &MBB;
5362 }
5363
5364 // Control flow needs to be inserted if indexing with a VGPR.
5365 const DebugLoc &DL = MI.getDebugLoc();
// Line 5366 (iterator init) is elided.
5367
5368 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5369 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5370
5371 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5372
// Build the waterfall loop; InsPt is where per-iteration code belongs.
5373 Register SGPRIdxReg;
5374 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5375 UseGPRIdxMode, SGPRIdxReg);
5376
5377 MachineBasicBlock *LoopBB = InsPt->getParent();
5378
5379 if (UseGPRIdxMode) {
5380 const MCInstrDesc &GPRIDXDesc =
5381 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5382
5383 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5384 .addReg(SrcReg)
5385 .addReg(SGPRIdxReg)
5386 .addImm(SubReg);
5387 } else {
5388 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5389 .addReg(SrcReg, {}, SubReg)
5390 .addReg(SrcReg, RegState::Implicit);
5391 }
5392
5393 MI.eraseFromParent();
5394
5395 return LoopBB;
5396}
5397
// NOTE(review): the signature lines (5398/5399) are elided; presumably
// static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
// MachineBasicBlock &MBB, const GCNSubtarget &ST). Expands an indirect
// vector-element write: INSERT_SUBREG for a known element, movreld /
// GPR-index write for an SGPR index, or a waterfall loop for a VGPR
// index.
5400 const GCNSubtarget &ST) {
5401 const SIInstrInfo *TII = ST.getInstrInfo();
5402 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5403 MachineFunction *MF = MBB.getParent();
// Line 5404 (MRI init) is elided in this rendering.
5405
5406 Register Dst = MI.getOperand(0).getReg();
5407 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5408 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5409 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5410 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5411 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5412 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5413
5414 // This can be an immediate, but will be folded later.
5415 assert(Val->getReg());
5416
5417 unsigned SubReg;
5418 std::tie(SubReg, Offset) =
5419 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5420 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5421
// No index register at all: the element is statically known, so a plain
// INSERT_SUBREG suffices.
5422 if (Idx->getReg() == AMDGPU::NoRegister) {
// Line 5423 (iterator init) is elided.
5424 const DebugLoc &DL = MI.getDebugLoc();
5425
5426 assert(Offset == 0);
5427
5428 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5429 .add(*SrcVec)
5430 .add(*Val)
5431 .addImm(SubReg);
5432
5433 MI.eraseFromParent();
5434 return &MBB;
5435 }
5436
5437 // Check for a SGPR index.
5438 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
// Line 5439 (iterator init) is elided.
5440 const DebugLoc &DL = MI.getDebugLoc();
5441
5442 if (UseGPRIdxMode) {
// Line 5443 (Idx register setup, presumably via getIndirectSGPRIdx) is
// elided in this rendering.
5444
5445 const MCInstrDesc &GPRIDXDesc =
5446 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5447 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5448 .addReg(SrcVec->getReg())
5449 .add(*Val)
5450 .addReg(Idx)
5451 .addImm(SubReg);
5452 } else {
// Line 5453 (setM0ToIndexFromSGPR call, presumably) is elided.
5454
5455 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5456 TRI.getRegSizeInBits(*VecRC), 32, false);
5457 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5458 .addReg(SrcVec->getReg())
5459 .add(*Val)
5460 .addImm(SubReg);
5461 }
5462 MI.eraseFromParent();
5463 return &MBB;
5464 }
5465
5466 // Control flow needs to be inserted if indexing with a VGPR.
// The value is re-read on every loop iteration, so its kill flags are
// no longer accurate.
5467 if (Val->isReg())
5468 MRI.clearKillFlags(Val->getReg());
5469
5470 const DebugLoc &DL = MI.getDebugLoc();
5471
5472 Register PhiReg = MRI.createVirtualRegister(VecRC);
5473
5474 Register SGPRIdxReg;
5475 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5476 UseGPRIdxMode, SGPRIdxReg);
5477 MachineBasicBlock *LoopBB = InsPt->getParent();
5478
5479 if (UseGPRIdxMode) {
5480 const MCInstrDesc &GPRIDXDesc =
5481 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5482
5483 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5484 .addReg(PhiReg)
5485 .add(*Val)
5486 .addReg(SGPRIdxReg)
5487 .addImm(SubReg);
5488 } else {
5489 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5490 TRI.getRegSizeInBits(*VecRC), 32, false);
5491 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5492 .addReg(PhiReg)
5493 .add(*Val)
5494 .addImm(SubReg);
5495 }
5496
5497 MI.eraseFromParent();
5498 return LoopBB;
5499}
5500
// NOTE(review): the signature line (5501) is elided in this rendering;
// this expands S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO during custom
// insertion.
5502 MachineBasicBlock *BB) {
5503 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5504 // For GFX12, we emit s_add_u64 and s_sub_u64.
5505 MachineFunction *MF = BB->getParent();
5506 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5507 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// Line 5508 (MRI init) is elided in this rendering.
5509 const DebugLoc &DL = MI.getDebugLoc();
5510 MachineOperand &Dest = MI.getOperand(0);
5511 MachineOperand &Src0 = MI.getOperand(1);
5512 MachineOperand &Src1 = MI.getOperand(2);
5513 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5514 if (ST.hasScalarAddSub64()) {
// Native 64-bit scalar add/sub available: single instruction.
5515 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5516 // clang-format off
5517 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5518 .add(Src0)
5519 .add(Src1);
5520 // clang-format on
5521 } else {
// Split both sources into 32-bit halves and chain through carry
// (add/addc or sub/subb), then recombine with REG_SEQUENCE.
5522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5523 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5524
5525 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5526 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5527
5528 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5529 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5530 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5531 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5532
5533 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5534 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5535 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5536 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5537
5538 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5539 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5540 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5541 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5542 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5543 .addReg(DestSub0)
5544 .addImm(AMDGPU::sub0)
5545 .addReg(DestSub1)
5546 .addImm(AMDGPU::sub1);
5547 }
5548 MI.eraseFromParent();
5549 return BB;
5550}
5551
// NOTE(review): the signature line (5552) is elided; per the
// llvm_unreachable message this is
// getIdentityValueFor32BitWaveReduction(unsigned Opc). Returns the
// identity element (as a raw 32-bit pattern) for each supported
// reduction opcode, i.e. the value x such that op(x, v) == v.
5553 switch (Opc) {
5554 case AMDGPU::S_MIN_U32:
5555 return std::numeric_limits<uint32_t>::max();
5556 case AMDGPU::S_MIN_I32:
5557 return std::numeric_limits<int32_t>::max();
5558 case AMDGPU::S_MAX_U32:
5559 return std::numeric_limits<uint32_t>::min();
5560 case AMDGPU::S_MAX_I32:
5561 return std::numeric_limits<int32_t>::min();
5562 case AMDGPU::V_ADD_F32_e64: // -0.0
5563 return 0x80000000;
5564 case AMDGPU::V_SUB_F32_e64: // +0.0
5565 return 0x0;
5566 case AMDGPU::S_ADD_I32:
5567 case AMDGPU::S_SUB_I32:
5568 case AMDGPU::S_OR_B32:
5569 case AMDGPU::S_XOR_B32:
5570 return std::numeric_limits<uint32_t>::min();
5571 case AMDGPU::S_AND_B32:
5572 return std::numeric_limits<uint32_t>::max();
// qNaN is the identity for IEEE min/max: min(qNaN, v) == v.
5573 case AMDGPU::V_MIN_F32_e64:
5574 case AMDGPU::V_MAX_F32_e64:
5575 return 0x7fc00000; // qNAN
5576 default:
5578 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5579 }
5580}
5581
// NOTE(review): the signature line (5582) is elided; per the
// llvm_unreachable message this is
// getIdentityValueFor64BitWaveReduction(unsigned Opc). 64-bit analogue
// of the 32-bit helper above: returns the reduction identity as a raw
// 64-bit pattern. The 64-bit min/max reductions are expressed via
// V_CMP_* compare opcodes rather than dedicated min/max instructions.
5583 switch (Opc) {
5584 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5585 return std::numeric_limits<uint64_t>::max();
5586 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5587 return std::numeric_limits<int64_t>::max();
5588 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5589 return std::numeric_limits<uint64_t>::min();
5590 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5591 return std::numeric_limits<int64_t>::min();
// qNaN is the identity for IEEE min/max: min(qNaN, v) == v.
5592 case AMDGPU::V_MIN_F64_e64:
5593 case AMDGPU::V_MAX_F64_e64:
5594 case AMDGPU::V_MIN_NUM_F64_e64:
5595 case AMDGPU::V_MAX_NUM_F64_e64:
5596 return 0x7FF8000000000000; // qNAN
5597 case AMDGPU::S_ADD_U64_PSEUDO:
5598 case AMDGPU::S_SUB_U64_PSEUDO:
5599 case AMDGPU::S_OR_B64:
5600 case AMDGPU::S_XOR_B64:
5601 return std::numeric_limits<uint64_t>::min();
5602 case AMDGPU::S_AND_B64:
5603 return std::numeric_limits<uint64_t>::max();
5604 case AMDGPU::V_ADD_F64_e64:
5605 case AMDGPU::V_ADD_F64_pseudo_e64:
5606 return 0x8000000000000000; // -0.0
5607 default:
5609 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5610 }
5611}
5612
5613static bool is32bitWaveReduceOperation(unsigned Opc) {
5614 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5615 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5616 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5617 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5618 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5619 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5620 Opc == AMDGPU::V_SUB_F32_e64;
5621}
5622
// NOTE(review): the signature line (5623) is elided; presumably
// static bool isFloatingPointWaveReduceOperation(unsigned Opc).
// Returns true for the floating-point wave-reduction opcodes (32- and
// 64-bit min/max/add, plus 32-bit sub and the F64 pseudo add).
5624 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5625 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5626 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5627 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5628 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5629}
5630
5633 const GCNSubtarget &ST,
5634 unsigned Opc) {
5636 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5637 const DebugLoc &DL = MI.getDebugLoc();
5638 const SIInstrInfo *TII = ST.getInstrInfo();
5639
5640 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5641 Register SrcReg = MI.getOperand(1).getReg();
5642 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5643 Register DstReg = MI.getOperand(0).getReg();
5644 MachineBasicBlock *RetBB = nullptr;
5645 if (isSGPR) {
5646 switch (Opc) {
5647 case AMDGPU::S_MIN_U32:
5648 case AMDGPU::S_MIN_I32:
5649 case AMDGPU::V_MIN_F32_e64:
5650 case AMDGPU::S_MAX_U32:
5651 case AMDGPU::S_MAX_I32:
5652 case AMDGPU::V_MAX_F32_e64:
5653 case AMDGPU::S_AND_B32:
5654 case AMDGPU::S_OR_B32: {
5655 // Idempotent operations.
5656 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5657 RetBB = &BB;
5658 break;
5659 }
5660 case AMDGPU::V_CMP_LT_U64_e64: // umin
5661 case AMDGPU::V_CMP_LT_I64_e64: // min
5662 case AMDGPU::V_CMP_GT_U64_e64: // umax
5663 case AMDGPU::V_CMP_GT_I64_e64: // max
5664 case AMDGPU::V_MIN_F64_e64:
5665 case AMDGPU::V_MIN_NUM_F64_e64:
5666 case AMDGPU::V_MAX_F64_e64:
5667 case AMDGPU::V_MAX_NUM_F64_e64:
5668 case AMDGPU::S_AND_B64:
5669 case AMDGPU::S_OR_B64: {
5670 // Idempotent operations.
5671 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5672 RetBB = &BB;
5673 break;
5674 }
5675 case AMDGPU::S_XOR_B32:
5676 case AMDGPU::S_XOR_B64:
5677 case AMDGPU::S_ADD_I32:
5678 case AMDGPU::S_ADD_U64_PSEUDO:
5679 case AMDGPU::V_ADD_F32_e64:
5680 case AMDGPU::V_ADD_F64_e64:
5681 case AMDGPU::V_ADD_F64_pseudo_e64:
5682 case AMDGPU::S_SUB_I32:
5683 case AMDGPU::S_SUB_U64_PSEUDO:
5684 case AMDGPU::V_SUB_F32_e64: {
5685 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5686 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5687 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5688 Register NumActiveLanes =
5689 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5690
5691 bool IsWave32 = ST.isWave32();
5692 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5693 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5694 unsigned BitCountOpc =
5695 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5696
5697 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5698
5699 auto NewAccumulator =
5700 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5701 .addReg(ExecMask);
5702
5703 switch (Opc) {
5704 case AMDGPU::S_XOR_B32:
5705 case AMDGPU::S_XOR_B64: {
5706 // Performing an XOR operation on a uniform value
5707 // depends on the parity of the number of active lanes.
5708 // For even parity, the result will be 0, for odd
5709 // parity the result will be the same as the input value.
5710 Register ParityRegister =
5711 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5712
5713 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5714 .addReg(NewAccumulator->getOperand(0).getReg())
5715 .addImm(1)
5716 .setOperandDead(3); // Dead scc
5717 if (Opc == AMDGPU::S_XOR_B32) {
5718 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5719 .addReg(SrcReg)
5720 .addReg(ParityRegister);
5721 } else {
5722 Register DestSub0 =
5723 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5724 Register DestSub1 =
5725 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5726
5727 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5728 const TargetRegisterClass *SrcSubRC =
5729 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5730
5731 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5732 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5733 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5734 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5735
5736 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5737 .add(Op1L)
5738 .addReg(ParityRegister);
5739
5740 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5741 .add(Op1H)
5742 .addReg(ParityRegister);
5743
5744 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5745 .addReg(DestSub0)
5746 .addImm(AMDGPU::sub0)
5747 .addReg(DestSub1)
5748 .addImm(AMDGPU::sub1);
5749 }
5750 break;
5751 }
5752 case AMDGPU::S_SUB_I32: {
5753 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5754
5755 // Take the negation of the source operand.
5756 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5757 .addImm(0)
5758 .addReg(SrcReg);
5759 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5760 .addReg(NegatedVal)
5761 .addReg(NewAccumulator->getOperand(0).getReg());
5762 break;
5763 }
5764 case AMDGPU::S_ADD_I32: {
5765 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5766 .addReg(SrcReg)
5767 .addReg(NewAccumulator->getOperand(0).getReg());
5768 break;
5769 }
5770 case AMDGPU::S_ADD_U64_PSEUDO:
5771 case AMDGPU::S_SUB_U64_PSEUDO: {
5772 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5773 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5774 Register Op1H_Op0L_Reg =
5775 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5776 Register Op1L_Op0H_Reg =
5777 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5778 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5779 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5780 Register NegatedValLo =
5781 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5782 Register NegatedValHi =
5783 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5784
5785 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5786 const TargetRegisterClass *Src1SubRC =
5787 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5788
5789 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5790 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5791 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5792 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5793
5794 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5795 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5796 .addImm(0)
5797 .addReg(NewAccumulator->getOperand(0).getReg())
5798 .setOperandDead(3); // Dead scc
5799 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5800 .addReg(NegatedValLo)
5801 .addImm(31)
5802 .setOperandDead(3); // Dead scc
5803 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5804 .add(Op1L)
5805 .addReg(NegatedValHi);
5806 }
5807 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5808 ? NegatedValLo
5809 : NewAccumulator->getOperand(0).getReg();
5810 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5811 .add(Op1L)
5812 .addReg(LowOpcode);
5813 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5814 .add(Op1L)
5815 .addReg(LowOpcode);
5816 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5817 .add(Op1H)
5818 .addReg(LowOpcode);
5819
5820 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5821 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5822 .addReg(CarryReg)
5823 .addReg(Op1H_Op0L_Reg)
5824 .setOperandDead(3); // Dead scc
5825
5826 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5827 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5828 .addReg(HiVal)
5829 .addReg(Op1L_Op0H_Reg)
5830 .setOperandDead(3); // Dead scc
5831 }
5832 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5833 .addReg(DestSub0)
5834 .addImm(AMDGPU::sub0)
5835 .addReg(DestSub1)
5836 .addImm(AMDGPU::sub1);
5837 break;
5838 }
5839 case AMDGPU::V_ADD_F32_e64:
5840 case AMDGPU::V_ADD_F64_e64:
5841 case AMDGPU::V_ADD_F64_pseudo_e64:
5842 case AMDGPU::V_SUB_F32_e64: {
5843 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5844 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
5845 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
5846 Register DstVreg = MRI.createVirtualRegister(VregRC);
5847 // Get number of active lanes as a float val.
5848 BuildMI(BB, MI, DL,
5849 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5850 : AMDGPU::V_CVT_F64_I32_e64),
5851 ActiveLanesVreg)
5852 .addReg(NewAccumulator->getOperand(0).getReg())
5853 .addImm(0) // clamp
5854 .addImm(0); // output-modifier
5855
5856 // Take negation of input for SUB reduction
5857 unsigned srcMod =
5858 (Opc == AMDGPU::V_SUB_F32_e64 ||
5859 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5862 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5863 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5864 ? AMDGPU::V_MUL_F64_pseudo_e64
5865 : AMDGPU::V_MUL_F64_e64;
5866 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
5867 DstVreg)
5868 .addImm(srcMod) // src0 modifier
5869 .addReg(SrcReg)
5870 .addImm(SISrcMods::NONE) // src1 modifier
5871 .addReg(ActiveLanesVreg)
5872 .addImm(SISrcMods::NONE) // clamp
5873 .addImm(SISrcMods::NONE); // output-mod
5874 if (is32BitOpc) {
5875 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5876 .addReg(DstVreg);
5877 } else {
5878 Register LaneValueLoReg =
5879 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5880 Register LaneValueHiReg =
5881 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5882 const TargetRegisterClass *VregSubRC =
5883 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5884 MachineOperand Op1L =
5885 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5886 VregRC, AMDGPU::sub0, VregSubRC);
5887 MachineOperand Op1H =
5888 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5889 VregRC, AMDGPU::sub1, VregSubRC);
5890 // lane value input should be in an sgpr
5891 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5892 LaneValueLoReg)
5893 .add(Op1L);
5894 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5895 LaneValueHiReg)
5896 .add(Op1H);
5897 NewAccumulator =
5898 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5899 .addReg(LaneValueLoReg)
5900 .addImm(AMDGPU::sub0)
5901 .addReg(LaneValueHiReg)
5902 .addImm(AMDGPU::sub1);
5903 }
5904 }
5905 }
5906 RetBB = &BB;
5907 }
5908 }
5909 } else {
5910 // TODO: Implement DPP Strategy and switch based on immediate strategy
5911 // operand. For now, for all the cases (default, Iterative and DPP we use
5912 // iterative approach by default.)
5913
5914 // To reduce the VGPR using iterative approach, we need to iterate
5915 // over all the active lanes. Lowering consists of ComputeLoop,
5916 // which iterate over only active lanes. We use copy of EXEC register
5917 // as induction variable and every active lane modifies it using bitset0
5918 // so that we will get the next active lane for next iteration.
5920 Register SrcReg = MI.getOperand(1).getReg();
5921 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5923
5924 // Create Control flow for loop
5925 // Split MI's Machine Basic block into For loop
5926 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5927
5928 // Create virtual registers required for lowering.
5929 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5930 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5931 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5932 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5933 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5934 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5935 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5936 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5937 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5938
5939 bool IsWave32 = ST.isWave32();
5940 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5941 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5942
5943 // Create initial values of induction variable from Exec, Accumulator and
5944 // insert branch instr to newly created ComputeBlock
5945 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5946 if (is32BitOpc) {
5948 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5949 .addImm(IdentityValue);
5950 } else {
5951 uint64_t IdentityValue =
5952 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5953 ? 0x0 // +0.0 for double sub reduction
5955 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5956 .addImm(IdentityValue);
5957 }
5958 // clang-format off
5959 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5960 .addMBB(ComputeLoop);
5961 // clang-format on
5962
5963 // Start constructing ComputeLoop
5964 I = ComputeLoop->begin();
5965 auto Accumulator =
5966 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5967 .addReg(IdentityValReg)
5968 .addMBB(&BB);
5969 auto ActiveBits =
5970 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5971 .addReg(LoopIterator)
5972 .addMBB(&BB);
5973
5974 I = ComputeLoop->end();
5975 MachineInstr *NewAccumulator;
5976 // Perform the computations
5977 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5978 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5979 .addReg(ActiveBitsReg);
5980 if (is32BitOpc) {
5981 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5982 LaneValueReg)
5983 .addReg(SrcReg)
5984 .addReg(FF1Reg);
5985 if (isFPOp) {
5986 Register LaneValVreg =
5987 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5988 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5989 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5990 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5991 LaneValVreg)
5992 .addReg(LaneValueReg);
5993 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5994 .addImm(0) // src0 modifier
5995 .addReg(Accumulator->getOperand(0).getReg())
5996 .addImm(0) // src1 modifier
5997 .addReg(LaneValVreg)
5998 .addImm(0) // clamp
5999 .addImm(0); // omod
6000 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6001 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6002 .addReg(DstVreg);
6003 } else {
6004 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6005 .addReg(Accumulator->getOperand(0).getReg())
6006 .addReg(LaneValueReg);
6007 }
6008 } else {
6009 Register LaneValueLoReg =
6010 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6011 Register LaneValueHiReg =
6012 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6013 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6014 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6015 const TargetRegisterClass *SrcSubRC =
6016 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6017 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6018 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
6019 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6020 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
6021 // lane value input should be in an sgpr
6022 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6023 LaneValueLoReg)
6024 .add(Op1L)
6025 .addReg(FF1Reg);
6026 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6027 LaneValueHiReg)
6028 .add(Op1H)
6029 .addReg(FF1Reg);
6030 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
6031 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
6032 .addReg(LaneValueLoReg)
6033 .addImm(AMDGPU::sub0)
6034 .addReg(LaneValueHiReg)
6035 .addImm(AMDGPU::sub1);
6036 switch (Opc) {
6037 case AMDGPU::S_OR_B64:
6038 case AMDGPU::S_AND_B64:
6039 case AMDGPU::S_XOR_B64: {
6040 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6041 .addReg(Accumulator->getOperand(0).getReg())
6042 .addReg(LaneValue->getOperand(0).getReg())
6043 .setOperandDead(3); // Dead scc
6044 break;
6045 }
6046 case AMDGPU::V_CMP_GT_I64_e64:
6047 case AMDGPU::V_CMP_GT_U64_e64:
6048 case AMDGPU::V_CMP_LT_I64_e64:
6049 case AMDGPU::V_CMP_LT_U64_e64: {
6050 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6051 Register ComparisonResultReg =
6052 MRI.createVirtualRegister(WaveMaskRegClass);
6053 int SrcIdx =
6054 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6055 const TargetRegisterClass *VregClass =
6056 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6057 const TargetRegisterClass *VSubRegClass =
6058 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6059 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6060 MachineOperand SrcReg0Sub0 =
6061 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
6062 VregClass, AMDGPU::sub0, VSubRegClass);
6063 MachineOperand SrcReg0Sub1 =
6064 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
6065 VregClass, AMDGPU::sub1, VSubRegClass);
6066 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
6067 AccumulatorVReg)
6068 .add(SrcReg0Sub0)
6069 .addImm(AMDGPU::sub0)
6070 .add(SrcReg0Sub1)
6071 .addImm(AMDGPU::sub1);
6072 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6073 .addReg(LaneValue->getOperand(0).getReg())
6074 .addReg(AccumulatorVReg);
6075
6076 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6077 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6078 .addReg(LaneMaskReg)
6079 .addReg(ActiveBitsReg);
6080
6081 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6082 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6083 .addReg(LaneValue->getOperand(0).getReg())
6084 .addReg(Accumulator->getOperand(0).getReg());
6085 break;
6086 }
6087 case AMDGPU::V_MIN_F64_e64:
6088 case AMDGPU::V_MIN_NUM_F64_e64:
6089 case AMDGPU::V_MAX_F64_e64:
6090 case AMDGPU::V_MAX_NUM_F64_e64:
6091 case AMDGPU::V_ADD_F64_e64:
6092 case AMDGPU::V_ADD_F64_pseudo_e64: {
6093 int SrcIdx =
6094 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6095 const TargetRegisterClass *VregRC =
6096 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6097 const TargetRegisterClass *VregSubRC =
6098 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6099 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6100 Register DstVreg = MRI.createVirtualRegister(VregRC);
6101 Register LaneValLo =
6102 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6103 Register LaneValHi =
6104 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6105 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6106 .addReg(Accumulator->getOperand(0).getReg());
6107 unsigned Modifier =
6108 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6111 auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6112 .addImm(Modifier) // src0 modifiers
6113 .addReg(LaneValue->getOperand(0).getReg())
6114 .addImm(SISrcMods::NONE) // src1 modifiers
6115 .addReg(AccumulatorVReg)
6116 .addImm(SISrcMods::NONE) // clamp
6117 .addImm(SISrcMods::NONE); // omod
6118 auto ReadLaneLo =
6119 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6120 LaneValLo);
6121 auto ReadLaneHi =
6122 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6123 LaneValHi);
6124 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6125 MachineOperand Op1L =
6126 TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
6127 VregRC, AMDGPU::sub0, VregSubRC);
6128 MachineOperand Op1H =
6129 TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
6130 VregRC, AMDGPU::sub1, VregSubRC);
6131 ReadLaneLo.add(Op1L);
6132 ReadLaneHi.add(Op1H);
6133 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6134 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6135 .addReg(LaneValLo)
6136 .addImm(AMDGPU::sub0)
6137 .addReg(LaneValHi)
6138 .addImm(AMDGPU::sub1);
6139 break;
6140 }
6141 case AMDGPU::S_ADD_U64_PSEUDO:
6142 case AMDGPU::S_SUB_U64_PSEUDO: {
6143 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6144 .addReg(Accumulator->getOperand(0).getReg())
6145 .addReg(LaneValue->getOperand(0).getReg());
6146 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6147 break;
6148 }
6149 }
6150 }
6151 // Manipulate the iterator to get the next active lane
6152 unsigned BITSETOpc =
6153 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6154 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6155 .addReg(FF1Reg)
6156 .addReg(ActiveBitsReg);
6157
6158 // Add phi nodes
6159 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6160 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6161
6162 // Creating branching
6163 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6164 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6165 .addReg(NewActiveBitsReg)
6166 .addImm(0);
6167 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6168 .addMBB(ComputeLoop);
6169
6170 RetBB = ComputeEnd;
6171 }
6172 MI.eraseFromParent();
6173 return RetBB;
6174}
6175
6178 MachineBasicBlock *BB) const {
6179 MachineFunction *MF = BB->getParent();
6181 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6183 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6185 const DebugLoc &DL = MI.getDebugLoc();
6186
6187 switch (MI.getOpcode()) {
6188 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6189 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6190 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6191 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6192 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6193 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6194 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6195 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6196 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6197 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6198 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6199 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6200 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6201 ? AMDGPU::V_MIN_NUM_F64_e64
6202 : AMDGPU::V_MIN_F64_e64);
6203 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6204 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6205 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6206 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6207 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6208 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6209 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6210 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6211 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6212 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6213 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6214 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6215 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6216 ? AMDGPU::V_MAX_NUM_F64_e64
6217 : AMDGPU::V_MAX_F64_e64);
6218 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6219 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6220 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6221 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6222 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6223 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6224 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6225 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6226 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6227 ? AMDGPU::V_ADD_F64_pseudo_e64
6228 : AMDGPU::V_ADD_F64_e64);
6229 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6230 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6231 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6232 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6233 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6234 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6235 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6236 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6237 // fadd + neg, by setting the NEG bit in the instruction.
6238 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6239 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6240 ? AMDGPU::V_ADD_F64_pseudo_e64
6241 : AMDGPU::V_ADD_F64_e64);
6242 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6243 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6244 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6245 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6246 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6247 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6248 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6249 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6250 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6251 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6252 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6253 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6254 case AMDGPU::S_UADDO_PSEUDO:
6255 case AMDGPU::S_USUBO_PSEUDO: {
6256 MachineOperand &Dest0 = MI.getOperand(0);
6257 MachineOperand &Dest1 = MI.getOperand(1);
6258 MachineOperand &Src0 = MI.getOperand(2);
6259 MachineOperand &Src1 = MI.getOperand(3);
6260
6261 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6262 ? AMDGPU::S_ADD_U32
6263 : AMDGPU::S_SUB_U32;
6264 // clang-format off
6265 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6266 .add(Src0)
6267 .add(Src1);
6268 // clang-format on
6269
6270 unsigned SelOpc =
6271 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6272 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6273
6274 MI.eraseFromParent();
6275 return BB;
6276 }
6277 case AMDGPU::S_ADD_U64_PSEUDO:
6278 case AMDGPU::S_SUB_U64_PSEUDO: {
6279 return Expand64BitScalarArithmetic(MI, BB);
6280 }
6281 case AMDGPU::V_ADD_U64_PSEUDO:
6282 case AMDGPU::V_SUB_U64_PSEUDO: {
6283 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6284
6285 MachineOperand &Dest = MI.getOperand(0);
6286 MachineOperand &Src0 = MI.getOperand(1);
6287 MachineOperand &Src1 = MI.getOperand(2);
6288
6289 if (ST.hasAddSubU64Insts()) {
6290 auto I = BuildMI(*BB, MI, DL,
6291 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6292 : AMDGPU::V_SUB_U64_e64),
6293 Dest.getReg())
6294 .add(Src0)
6295 .add(Src1)
6296 .addImm(0); // clamp
6297 TII->legalizeOperands(*I);
6298 MI.eraseFromParent();
6299 return BB;
6300 }
6301
6302 if (IsAdd && ST.hasLshlAddU64Inst()) {
6303 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6304 Dest.getReg())
6305 .add(Src0)
6306 .addImm(0)
6307 .add(Src1);
6308 TII->legalizeOperands(*Add);
6309 MI.eraseFromParent();
6310 return BB;
6311 }
6312
6313 const auto *CarryRC = TRI->getWaveMaskRegClass();
6314
6315 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6316 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6317
6318 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6319 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6320
6321 const TargetRegisterClass *Src0RC = Src0.isReg()
6322 ? MRI.getRegClass(Src0.getReg())
6323 : &AMDGPU::VReg_64RegClass;
6324 const TargetRegisterClass *Src1RC = Src1.isReg()
6325 ? MRI.getRegClass(Src1.getReg())
6326 : &AMDGPU::VReg_64RegClass;
6327
6328 const TargetRegisterClass *Src0SubRC =
6329 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6330 const TargetRegisterClass *Src1SubRC =
6331 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6332
6333 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6334 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6335 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6336 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6337
6338 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6339 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6340 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6341 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6342
6343 unsigned LoOpc =
6344 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6345 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6346 .addReg(CarryReg, RegState::Define)
6347 .add(SrcReg0Sub0)
6348 .add(SrcReg1Sub0)
6349 .addImm(0); // clamp bit
6350
6351 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6352 MachineInstr *HiHalf =
6353 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6354 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6355 .add(SrcReg0Sub1)
6356 .add(SrcReg1Sub1)
6357 .addReg(CarryReg, RegState::Kill)
6358 .addImm(0); // clamp bit
6359
6360 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6361 .addReg(DestSub0)
6362 .addImm(AMDGPU::sub0)
6363 .addReg(DestSub1)
6364 .addImm(AMDGPU::sub1);
6365 TII->legalizeOperands(*LoHalf);
6366 TII->legalizeOperands(*HiHalf);
6367 MI.eraseFromParent();
6368 return BB;
6369 }
6370 case AMDGPU::S_ADD_CO_PSEUDO:
6371 case AMDGPU::S_SUB_CO_PSEUDO: {
6372 // This pseudo has a chance to be selected
6373 // only from uniform add/subcarry node. All the VGPR operands
6374 // therefore assumed to be splat vectors.
6376 MachineOperand &Dest = MI.getOperand(0);
6377 MachineOperand &CarryDest = MI.getOperand(1);
6378 MachineOperand &Src0 = MI.getOperand(2);
6379 MachineOperand &Src1 = MI.getOperand(3);
6380 MachineOperand &Src2 = MI.getOperand(4);
6381 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6382 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6383 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6384 .addReg(Src0.getReg());
6385 Src0.setReg(RegOp0);
6386 }
6387 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6388 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6389 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6390 .addReg(Src1.getReg());
6391 Src1.setReg(RegOp1);
6392 }
6393 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6394 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6395 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6396 .addReg(Src2.getReg());
6397 Src2.setReg(RegOp2);
6398 }
6399
6400 if (ST.isWave64()) {
6401 if (ST.hasScalarCompareEq64()) {
6402 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6403 .addReg(Src2.getReg())
6404 .addImm(0);
6405 } else {
6406 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6407 const TargetRegisterClass *SubRC =
6408 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6409 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6410 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6411 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6412 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6413 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6414
6415 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6416 .add(Src2Sub0)
6417 .add(Src2Sub1);
6418
6419 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6420 .addReg(Src2_32, RegState::Kill)
6421 .addImm(0);
6422 }
6423 } else {
6424 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6425 .addReg(Src2.getReg())
6426 .addImm(0);
6427 }
6428
6429 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6430 ? AMDGPU::S_ADDC_U32
6431 : AMDGPU::S_SUBB_U32;
6432
6433 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6434
6435 unsigned SelOpc =
6436 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6437
6438 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6439 .addImm(-1)
6440 .addImm(0);
6441
6442 MI.eraseFromParent();
6443 return BB;
6444 }
6445 case AMDGPU::SI_INIT_M0: {
6446 MachineOperand &M0Init = MI.getOperand(0);
6447 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6448 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6449 AMDGPU::M0)
6450 .add(M0Init);
6451 MI.eraseFromParent();
6452 return BB;
6453 }
6454 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6455 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6456 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6457 TII->get(AMDGPU::S_CMP_EQ_U32))
6458 .addImm(0)
6459 .addImm(0);
6460 return BB;
6461 }
6462 case AMDGPU::GET_GROUPSTATICSIZE: {
6463 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6464 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6465 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6466 .add(MI.getOperand(0))
6467 .addImm(MFI->getLDSSize());
6468 MI.eraseFromParent();
6469 return BB;
6470 }
6471 case AMDGPU::GET_SHADERCYCLESHILO: {
6472 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6473 // The algorithm is:
6474 //
6475 // hi1 = getreg(SHADER_CYCLES_HI)
6476 // lo1 = getreg(SHADER_CYCLES_LO)
6477 // hi2 = getreg(SHADER_CYCLES_HI)
6478 //
6479 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6480 // Otherwise there was overflow and the result is hi2:0. In both cases the
6481 // result should represent the actual time at some point during the sequence
6482 // of three getregs.
6483 using namespace AMDGPU::Hwreg;
6484 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6485 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6486 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6487 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6488 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6489 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6490 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6491 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6492 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6493 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6494 .addReg(RegHi1)
6495 .addReg(RegHi2);
6496 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6497 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6498 .addReg(RegLo1)
6499 .addImm(0);
6500 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6501 .add(MI.getOperand(0))
6502 .addReg(RegLo)
6503 .addImm(AMDGPU::sub0)
6504 .addReg(RegHi2)
6505 .addImm(AMDGPU::sub1);
6506 MI.eraseFromParent();
6507 return BB;
6508 }
6509 case AMDGPU::SI_INDIRECT_SRC_V1:
6510 case AMDGPU::SI_INDIRECT_SRC_V2:
6511 case AMDGPU::SI_INDIRECT_SRC_V3:
6512 case AMDGPU::SI_INDIRECT_SRC_V4:
6513 case AMDGPU::SI_INDIRECT_SRC_V5:
6514 case AMDGPU::SI_INDIRECT_SRC_V6:
6515 case AMDGPU::SI_INDIRECT_SRC_V7:
6516 case AMDGPU::SI_INDIRECT_SRC_V8:
6517 case AMDGPU::SI_INDIRECT_SRC_V9:
6518 case AMDGPU::SI_INDIRECT_SRC_V10:
6519 case AMDGPU::SI_INDIRECT_SRC_V11:
6520 case AMDGPU::SI_INDIRECT_SRC_V12:
6521 case AMDGPU::SI_INDIRECT_SRC_V16:
6522 case AMDGPU::SI_INDIRECT_SRC_V32:
6523 return emitIndirectSrc(MI, *BB, *getSubtarget());
6524 case AMDGPU::SI_INDIRECT_DST_V1:
6525 case AMDGPU::SI_INDIRECT_DST_V2:
6526 case AMDGPU::SI_INDIRECT_DST_V3:
6527 case AMDGPU::SI_INDIRECT_DST_V4:
6528 case AMDGPU::SI_INDIRECT_DST_V5:
6529 case AMDGPU::SI_INDIRECT_DST_V6:
6530 case AMDGPU::SI_INDIRECT_DST_V7:
6531 case AMDGPU::SI_INDIRECT_DST_V8:
6532 case AMDGPU::SI_INDIRECT_DST_V9:
6533 case AMDGPU::SI_INDIRECT_DST_V10:
6534 case AMDGPU::SI_INDIRECT_DST_V11:
6535 case AMDGPU::SI_INDIRECT_DST_V12:
6536 case AMDGPU::SI_INDIRECT_DST_V16:
6537 case AMDGPU::SI_INDIRECT_DST_V32:
6538 return emitIndirectDst(MI, *BB, *getSubtarget());
6539 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6540 case AMDGPU::SI_KILL_I1_PSEUDO:
6541 return splitKillBlock(MI, BB);
6542 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6543 Register Dst = MI.getOperand(0).getReg();
6544 const MachineOperand &Src0 = MI.getOperand(1);
6545 const MachineOperand &Src1 = MI.getOperand(2);
6546 Register SrcCond = MI.getOperand(3).getReg();
6547
6548 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6549 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6550 const auto *CondRC = TRI->getWaveMaskRegClass();
6551 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6552
6553 const TargetRegisterClass *Src0RC = Src0.isReg()
6554 ? MRI.getRegClass(Src0.getReg())
6555 : &AMDGPU::VReg_64RegClass;
6556 const TargetRegisterClass *Src1RC = Src1.isReg()
6557 ? MRI.getRegClass(Src1.getReg())
6558 : &AMDGPU::VReg_64RegClass;
6559
6560 const TargetRegisterClass *Src0SubRC =
6561 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6562 const TargetRegisterClass *Src1SubRC =
6563 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6564
6565 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6566 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6567 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6568 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6569
6570 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6571 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6572 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6573 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6574
6575 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6576 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6577 .addImm(0)
6578 .add(Src0Sub0)
6579 .addImm(0)
6580 .add(Src1Sub0)
6581 .addReg(SrcCondCopy);
6582 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6583 .addImm(0)
6584 .add(Src0Sub1)
6585 .addImm(0)
6586 .add(Src1Sub1)
6587 .addReg(SrcCondCopy);
6588
6589 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6590 .addReg(DstLo)
6591 .addImm(AMDGPU::sub0)
6592 .addReg(DstHi)
6593 .addImm(AMDGPU::sub1);
6594 MI.eraseFromParent();
6595 return BB;
6596 }
6597 case AMDGPU::SI_BR_UNDEF: {
6598 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6599 .add(MI.getOperand(0));
6600 Br->getOperand(1).setIsUndef(); // read undef SCC
6601 MI.eraseFromParent();
6602 return BB;
6603 }
6604 case AMDGPU::ADJCALLSTACKUP:
6605 case AMDGPU::ADJCALLSTACKDOWN: {
6607 MachineInstrBuilder MIB(*MF, &MI);
6608 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6609 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6610 return BB;
6611 }
6612 case AMDGPU::SI_CALL_ISEL: {
6613 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6614
6616 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6617
6618 for (const MachineOperand &MO : MI.operands())
6619 MIB.add(MO);
6620
6621 MIB.cloneMemRefs(MI);
6622 MI.eraseFromParent();
6623 return BB;
6624 }
6625 case AMDGPU::V_ADD_CO_U32_e32:
6626 case AMDGPU::V_SUB_CO_U32_e32:
6627 case AMDGPU::V_SUBREV_CO_U32_e32: {
6628 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6629 unsigned Opc = MI.getOpcode();
6630
6631 bool NeedClampOperand = false;
6632 if (TII->pseudoToMCOpcode(Opc) == -1) {
6634 NeedClampOperand = true;
6635 }
6636
6637 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6638 if (TII->isVOP3(*I)) {
6639 I.addReg(TRI->getVCC(), RegState::Define);
6640 }
6641 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6642 if (NeedClampOperand)
6643 I.addImm(0); // clamp bit for e64 encoding
6644
6645 TII->legalizeOperands(*I);
6646
6647 MI.eraseFromParent();
6648 return BB;
6649 }
6650 case AMDGPU::V_ADDC_U32_e32:
6651 case AMDGPU::V_SUBB_U32_e32:
6652 case AMDGPU::V_SUBBREV_U32_e32:
6653 // These instructions have an implicit use of vcc which counts towards the
6654 // constant bus limit.
6655 TII->legalizeOperands(MI);
6656 return BB;
6657 case AMDGPU::DS_GWS_INIT:
6658 case AMDGPU::DS_GWS_SEMA_BR:
6659 case AMDGPU::DS_GWS_BARRIER:
6660 case AMDGPU::DS_GWS_SEMA_V:
6661 case AMDGPU::DS_GWS_SEMA_P:
6662 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6663 // A s_waitcnt 0 is required to be the instruction immediately following.
6664 if (getSubtarget()->hasGWSAutoReplay()) {
6666 return BB;
6667 }
6668
6669 return emitGWSMemViolTestLoop(MI, BB);
6670 case AMDGPU::S_SETREG_B32: {
6671 // Try to optimize cases that only set the denormal mode or rounding mode.
6672 //
6673 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6674 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6675 // instead.
6676 //
6677 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6678 // allow you to have a no side effect instruction in the output of a
6679 // sideeffecting pattern.
6680 auto [ID, Offset, Width] =
6681 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6683 return BB;
6684
6685 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6686 const unsigned SetMask = WidthMask << Offset;
6687
6688 if (getSubtarget()->hasDenormModeInst()) {
6689 unsigned SetDenormOp = 0;
6690 unsigned SetRoundOp = 0;
6691
6692 // The dedicated instructions can only set the whole denorm or round mode
6693 // at once, not a subset of bits in either.
6694 if (SetMask ==
6696 // If this fully sets both the round and denorm mode, emit the two
6697 // dedicated instructions for these.
6698 SetRoundOp = AMDGPU::S_ROUND_MODE;
6699 SetDenormOp = AMDGPU::S_DENORM_MODE;
6700 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6701 SetRoundOp = AMDGPU::S_ROUND_MODE;
6702 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6703 SetDenormOp = AMDGPU::S_DENORM_MODE;
6704 }
6705
6706 if (SetRoundOp || SetDenormOp) {
6707 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6708 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6709 unsigned ImmVal = Def->getOperand(1).getImm();
6710 if (SetRoundOp) {
6711 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6712 .addImm(ImmVal & 0xf);
6713
6714 // If we also have the denorm mode, get just the denorm mode bits.
6715 ImmVal >>= 4;
6716 }
6717
6718 if (SetDenormOp) {
6719 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6720 .addImm(ImmVal & 0xf);
6721 }
6722
6723 MI.eraseFromParent();
6724 return BB;
6725 }
6726 }
6727 }
6728
6729 // If only FP bits are touched, used the no side effects pseudo.
6730 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6731 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6732 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6733
6734 return BB;
6735 }
6736 case AMDGPU::S_INVERSE_BALLOT_U32:
6737 case AMDGPU::S_INVERSE_BALLOT_U64:
6738 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6739 // necessary. After that they are equivalent to a COPY.
6740 MI.setDesc(TII->get(AMDGPU::COPY));
6741 return BB;
6742 case AMDGPU::ENDPGM_TRAP: {
6743 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6744 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6745 MI.addOperand(MachineOperand::CreateImm(0));
6746 return BB;
6747 }
6748
6749 // We need a block split to make the real endpgm a terminator. We also don't
6750 // want to break phis in successor blocks, so we can't just delete to the
6751 // end of the block.
6752
6753 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6755 MF->push_back(TrapBB);
6756 // clang-format off
6757 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6758 .addImm(0);
6759 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6760 .addMBB(TrapBB);
6761 // clang-format on
6762
6763 BB->addSuccessor(TrapBB);
6764 MI.eraseFromParent();
6765 return SplitBB;
6766 }
6767 case AMDGPU::SIMULATED_TRAP: {
6768 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6769 MachineBasicBlock *SplitBB =
6770 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6771 MI.eraseFromParent();
6772 return SplitBB;
6773 }
6774 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6775 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6777
6778 // During ISel, it's difficult to propagate the original EXEC mask to use as
6779 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6780 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6781 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6782 Register OriginalExec = Setup->getOperand(0).getReg();
6783 MF->getRegInfo().clearKillFlags(OriginalExec);
6784 MI.getOperand(0).setReg(OriginalExec);
6785 return BB;
6786 }
6787 default:
6788 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6789 if (!MI.mayStore())
6791 return BB;
6792 }
6794 }
6795}
6796
6798 // This currently forces unfolding various combinations of fsub into fma with
6799 // free fneg'd operands. As long as we have fast FMA (controlled by
6800 // isFMAFasterThanFMulAndFAdd), we should perform these.
6801
6802 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6803 // most of these combines appear to be cycle neutral but save on instruction
6804 // count / code size.
6805 return true;
6806}
6807
6809
6811 EVT VT) const {
6812 if (!VT.isVector()) {
6813 return MVT::i1;
6814 }
6815 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6816}
6817
6819 // TODO: Should i16 be used always if legal? For now it would force VALU
6820 // shifts.
6821 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6822}
6823
6825 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6826 ? Ty.changeElementSize(16)
6827 : Ty.changeElementSize(32);
6828}
6829
6830// Answering this is somewhat tricky and depends on the specific device which
6831// have different rates for fma or all f64 operations.
6832//
6833// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6834// regardless of which device (although the number of cycles differs between
6835// devices), so it is always profitable for f64.
6836//
6837// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6838// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6839// which we can always do even without fused FP ops since it returns the same
6840// result as the separate operations and since it is always full
6841// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6842// however does not support denormals, so we do report fma as faster if we have
6843// a fast fma device and require denormals.
6844//
6846 EVT VT) const {
6847 VT = VT.getScalarType();
6848
6849 switch (VT.getSimpleVT().SimpleTy) {
6850 case MVT::f32: {
6851 // If mad is not available this depends only on if f32 fma is full rate.
6852 if (!Subtarget->hasMadMacF32Insts())
6853 return Subtarget->hasFastFMAF32();
6854
6855 // Otherwise f32 mad is always full rate and returns the same result as
6856 // the separate operations so should be preferred over fma.
6857 // However does not support denormals.
6859 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6860
6861 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6862 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6863 }
6864 case MVT::f64:
6865 return true;
6866 case MVT::f16:
6867 case MVT::bf16:
6868 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6869 default:
6870 break;
6871 }
6872
6873 return false;
6874}
6875
6877 LLT Ty) const {
6878 switch (Ty.getScalarSizeInBits()) {
6879 case 16:
6880 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6881 case 32:
6882 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6883 case 64:
6884 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6885 default:
6886 break;
6887 }
6888
6889 return false;
6890}
6891
6893 if (!Ty.isScalar())
6894 return false;
6895
6896 if (Ty.getScalarSizeInBits() == 16)
6897 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6898 if (Ty.getScalarSizeInBits() == 32)
6899 return Subtarget->hasMadMacF32Insts() &&
6900 denormalModeIsFlushAllF32(*MI.getMF());
6901
6902 return false;
6903}
6904
6906 const SDNode *N) const {
6907 // TODO: Check future ftz flag
6908 // v_mad_f32/v_mac_f32 do not support denormals.
6909 EVT VT = N->getValueType(0);
6910 if (VT == MVT::f32)
6911 return Subtarget->hasMadMacF32Insts() &&
6913 if (VT == MVT::f16) {
6914 return Subtarget->hasMadF16() &&
6916 }
6917
6918 return false;
6919}
6920
6921//===----------------------------------------------------------------------===//
6922// Custom DAG Lowering Operations
6923//===----------------------------------------------------------------------===//
6924
6925// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6926// wider vector type is legal.
6928 SelectionDAG &DAG) const {
6929 unsigned Opc = Op.getOpcode();
6930 EVT VT = Op.getValueType();
6931 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6932 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6933 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6934 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6935 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6936 VT == MVT::v32bf16);
6937
6938 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6939
6940 SDLoc SL(Op);
6941 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6942 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6943
6944 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6945}
6946
6947// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6948// regression whereby extra unnecessary instructions were added to codegen
6949// for rotr operations, casued by legalising v2i32 or. This resulted in extra
6950// instructions to extract the result from the vector.
6952 [[maybe_unused]] EVT VT = Op.getValueType();
6953
6954 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6955 VT == MVT::v16i32) &&
6956 "Unexpected ValueType.");
6957
6958 return DAG.UnrollVectorOp(Op.getNode());
6959}
6960
6961// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6962// wider vector type is legal.
6964 SelectionDAG &DAG) const {
6965 unsigned Opc = Op.getOpcode();
6966 EVT VT = Op.getValueType();
6967 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6968 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6969 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6970 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6971 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6972 VT == MVT::v32bf16);
6973
6974 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6975 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6976
6977 SDLoc SL(Op);
6978
6979 SDValue OpLo =
6980 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6981 SDValue OpHi =
6982 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6983
6984 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6985}
6986
6988 SelectionDAG &DAG) const {
6989 unsigned Opc = Op.getOpcode();
6990 EVT VT = Op.getValueType();
6991 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6992 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6993 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6994 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6995 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6996 VT == MVT::v32bf16);
6997
6998 SDValue Op0 = Op.getOperand(0);
6999 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7000 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7001 : std::pair(Op0, Op0);
7002
7003 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7004 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7005
7006 SDLoc SL(Op);
7007 auto ResVT = DAG.GetSplitDestVTs(VT);
7008
7009 SDValue OpLo =
7010 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7011 SDValue OpHi =
7012 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7013
7014 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7015}
7016
7018 switch (Op.getOpcode()) {
7019 default:
7021 case ISD::BRCOND:
7022 return LowerBRCOND(Op, DAG);
7023 case ISD::RETURNADDR:
7024 return LowerRETURNADDR(Op, DAG);
7025 case ISD::SPONENTRY:
7026 return LowerSPONENTRY(Op, DAG);
7027 case ISD::LOAD: {
7028 SDValue Result = LowerLOAD(Op, DAG);
7029 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7030 "Load should return a value and a chain");
7031 return Result;
7032 }
7033 case ISD::FSQRT: {
7034 EVT VT = Op.getValueType();
7035 if (VT == MVT::f32)
7036 return lowerFSQRTF32(Op, DAG);
7037 if (VT == MVT::f64)
7038 return lowerFSQRTF64(Op, DAG);
7039 return SDValue();
7040 }
7041 case ISD::FSIN:
7042 case ISD::FCOS:
7043 return LowerTrig(Op, DAG);
7044 case ISD::SELECT:
7045 return LowerSELECT(Op, DAG);
7046 case ISD::FDIV:
7047 return LowerFDIV(Op, DAG);
7048 case ISD::FFREXP:
7049 return LowerFFREXP(Op, DAG);
7051 return LowerATOMIC_CMP_SWAP(Op, DAG);
7052 case ISD::STORE:
7053 return LowerSTORE(Op, DAG);
7054 case ISD::GlobalAddress: {
7057 return LowerGlobalAddress(MFI, Op, DAG);
7058 }
7060 return LowerExternalSymbol(Op, DAG);
7062 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7064 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7066 return LowerINTRINSIC_VOID(Op, DAG);
7067 case ISD::ADDRSPACECAST:
7068 return lowerADDRSPACECAST(Op, DAG);
7070 return lowerINSERT_SUBVECTOR(Op, DAG);
7072 return lowerINSERT_VECTOR_ELT(Op, DAG);
7074 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7076 return lowerVECTOR_SHUFFLE(Op, DAG);
7078 return lowerSCALAR_TO_VECTOR(Op, DAG);
7079 case ISD::BUILD_VECTOR:
7080 return lowerBUILD_VECTOR(Op, DAG);
7081 case ISD::FP_ROUND:
7083 return lowerFP_ROUND(Op, DAG);
7084 case ISD::TRAP:
7085 return lowerTRAP(Op, DAG);
7086 case ISD::DEBUGTRAP:
7087 return lowerDEBUGTRAP(Op, DAG);
7088 case ISD::ABS:
7089 case ISD::FABS:
7090 case ISD::FNEG:
7091 case ISD::FCANONICALIZE:
7092 case ISD::BSWAP:
7093 return splitUnaryVectorOp(Op, DAG);
7094 case ISD::FMINNUM:
7095 case ISD::FMAXNUM:
7096 return lowerFMINNUM_FMAXNUM(Op, DAG);
7097 case ISD::FMINIMUMNUM:
7098 case ISD::FMAXIMUMNUM:
7099 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7100 case ISD::FMINIMUM:
7101 case ISD::FMAXIMUM:
7102 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7103 case ISD::FLDEXP:
7104 case ISD::STRICT_FLDEXP:
7105 return lowerFLDEXP(Op, DAG);
7106 case ISD::FMA:
7107 return splitTernaryVectorOp(Op, DAG);
7108 case ISD::FP_TO_SINT:
7109 case ISD::FP_TO_UINT:
7110 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7111 Op.getValueType() == MVT::i16 &&
7112 Op.getOperand(0).getValueType() == MVT::f32) {
7113 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7114 return Op;
7115 }
7116 return LowerFP_TO_INT(Op, DAG);
7117 case ISD::SHL:
7118 case ISD::SRA:
7119 case ISD::SRL:
7120 case ISD::ADD:
7121 case ISD::SUB:
7122 case ISD::SMIN:
7123 case ISD::SMAX:
7124 case ISD::UMIN:
7125 case ISD::UMAX:
7126 case ISD::FADD:
7127 case ISD::FMUL:
7128 case ISD::FMINNUM_IEEE:
7129 case ISD::FMAXNUM_IEEE:
7130 case ISD::UADDSAT:
7131 case ISD::USUBSAT:
7132 case ISD::SADDSAT:
7133 case ISD::SSUBSAT:
7134 return splitBinaryVectorOp(Op, DAG);
7135 case ISD::FCOPYSIGN:
7136 return lowerFCOPYSIGN(Op, DAG);
7137 case ISD::MUL:
7138 return lowerMUL(Op, DAG);
7139 case ISD::SMULO:
7140 case ISD::UMULO:
7141 return lowerXMULO(Op, DAG);
7142 case ISD::SMUL_LOHI:
7143 case ISD::UMUL_LOHI:
7144 return lowerXMUL_LOHI(Op, DAG);
7146 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7147 case ISD::STACKSAVE:
7148 return LowerSTACKSAVE(Op, DAG);
7149 case ISD::GET_ROUNDING:
7150 return lowerGET_ROUNDING(Op, DAG);
7151 case ISD::SET_ROUNDING:
7152 return lowerSET_ROUNDING(Op, DAG);
7153 case ISD::PREFETCH:
7154 return lowerPREFETCH(Op, DAG);
7155 case ISD::FP_EXTEND:
7157 return lowerFP_EXTEND(Op, DAG);
7158 case ISD::GET_FPENV:
7159 return lowerGET_FPENV(Op, DAG);
7160 case ISD::SET_FPENV:
7161 return lowerSET_FPENV(Op, DAG);
7162 case ISD::ROTR:
7163 return lowerROTR(Op, DAG);
7164 }
7165 return SDValue();
7166}
7167
7168// Used for D16: Casts the result of an instruction into the right vector,
7169// packs values if loads return unpacked values.
7171 const SDLoc &DL, SelectionDAG &DAG,
7172 bool Unpacked) {
7173 if (!LoadVT.isVector())
7174 return Result;
7175
7176 // Cast back to the original packed type or to a larger type that is a
7177 // multiple of 32 bit for D16. Widening the return type is a required for
7178 // legalization.
7179 EVT FittingLoadVT = LoadVT;
7180 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7181 FittingLoadVT =
7183 LoadVT.getVectorNumElements() + 1);
7184 }
7185
7186 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7187 // Truncate to v2i16/v4i16.
7188 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7189
7190 // Workaround legalizer not scalarizing truncate after vector op
7191 // legalization but not creating intermediate vector trunc.
7193 DAG.ExtractVectorElements(Result, Elts);
7194 for (SDValue &Elt : Elts)
7195 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7196
7197 // Pad illegal v1i16/v3fi6 to v4i16
7198 if ((LoadVT.getVectorNumElements() % 2) == 1)
7199 Elts.push_back(DAG.getPOISON(MVT::i16));
7200
7201 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7202
7203 // Bitcast to original type (v2f16/v4f16).
7204 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7205 }
7206
7207 // Cast back to the original packed type.
7208 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7209}
7210
// Build a D16 (16-bit-component) intrinsic load with a result type the target
// can handle, then convert the result back to the requested type.
// - With unpacked D16 VMem, each 16-bit element occupies a full 32-bit result
//   register, so the memory node is built with an i32-per-element vector type.
// - Otherwise an odd element count (e.g. v3f16) is widened by one element to a
//   legal even-count type.
// The converted value is merged with the load's chain for the caller.
// NOTE(review): this listing omits some lines of the definition (e.g. an
// operand-list parameter and the statement that creates 'Load', presumably a
// DAG.getMemIntrinsicNode call using VTList/Ops) — comments describe only
// what is visible here.
7211 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7212                                               SelectionDAG &DAG,
7214                                               bool IsIntrinsic) const {
7215   SDLoc DL(M);
7216 
7217   bool Unpacked = Subtarget->hasUnpackedD16VMem();
7218   EVT LoadVT = M->getValueType(0);
7219 
// Result type actually used for the memory node; starts equal to the
// requested type and is adjusted below when that type is not usable directly.
7220   EVT EquivLoadVT = LoadVT;
7221   if (LoadVT.isVector()) {
7222     if (Unpacked) {
// Unpacked D16: one 32-bit lane per element.
7223       EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7224                                      LoadVT.getVectorNumElements());
7225     } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7226       // Widen v3f16 to legal type
7227       EquivLoadVT =
7229                         LoadVT.getVectorNumElements() + 1);
7230     }
7231   }
7232 
7233   // Change from v4f16/v2f16 to EquivLoadVT.
7234   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
// For intrinsic nodes the generic INTRINSIC_W_CHAIN opcode is used; otherwise
// the caller-supplied target opcode.
7237       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7238       M->getMemoryVT(), M->getMemOperand());
7239 
// Convert the widened/unpacked result back to the originally requested type.
7240   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7241 
// Return {value, chain} as expected for a load-like node.
7242   return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7243 }
7244
// Lower a buffer-load style intrinsic (plain or "format" variant) to the
// corresponding AMDGPUISD buffer-load node, dispatching on three properties:
// - D16: format loads of 16-bit elements go through adjustLoadValueType.
// - Sub-32-bit scalars: handled by handleByteShortBufferLoads
//   (BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT).
// - Illegal result types: built with an equivalent legal memory type and
//   bitcast back.
// \p Ops are the operands for the resulting memory node; the result is
// {value, chain} (plus a status value when TFE is in use).
7245 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7246                                              SelectionDAG &DAG,
7247                                              ArrayRef<SDValue> Ops) const {
7248   SDLoc DL(M);
7249   EVT LoadVT = M->getValueType(0);
7250   EVT EltType = LoadVT.getScalarType();
7251   EVT IntVT = LoadVT.changeTypeToInteger();
7252 
// D16 only applies to format loads with 16-bit components.
7253   bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7254 
// Two results = {value, chain}; three = {value, TFE status, chain}.
7255   assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7256   bool IsTFE = M->getNumValues() == 3;
7257 
// Select among the four buffer-load node kinds: {format, raw} x {TFE, no-TFE}.
7258   unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7259                                    : AMDGPUISD::BUFFER_LOAD_FORMAT)
7260                 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7261                         : AMDGPUISD::BUFFER_LOAD;
7262 
7263   if (IsD16) {
7264     return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7265   }
7266 
7267   // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7268   if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7269     return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7270                                       IsTFE);
7271 
// Legal result type: emit the node directly with the integer memory type.
7272   if (isTypeLegal(LoadVT)) {
7273     return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7274                                M->getMemOperand(), DAG);
7275   }
7276 
// Illegal result type: load as an equivalent legal type of the same size,
// then bitcast the loaded value back to the requested type.
7277   EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7278   SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7279   SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7280                                         M->getMemOperand(), DAG);
7281   return DAG.getMergeValues(
7282       {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7283       DL);
7284 }
7285
// Continuation of an integer-compare intrinsic lowering helper — the opening
// signature line is not visible in this listing (the node \p N appears to
// carry {id?, LHS, RHS, predicate} operands; TODO confirm against the full
// source). Produces a wavefront-sized lane mask via AMDGPUISD::SETCC,
// zero-extended or truncated to the intrinsic's result type.
7287                                   SelectionDAG &DAG) {
7288   EVT VT = N->getValueType(0);
// Operand 3 holds the IR-level icmp predicate as an immediate; reject
// non-integer predicates by folding to poison.
7289   unsigned CondCode = N->getConstantOperandVal(3);
7290   if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7291     return DAG.getPOISON(VT);
7292 
7293   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7294 
7295   SDValue LHS = N->getOperand(1);
7296   SDValue RHS = N->getOperand(2);
7297 
7298   SDLoc DL(N);
7299 
7300   EVT CmpVT = LHS.getValueType();
// Promote illegal i16 operands to i32 before comparing. NOTE(review): the
// statement choosing the extension opcode (presumably sign- vs zero-extend
// based on the predicate's signedness) is not visible in this listing.
7301   if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7302     unsigned PromoteOp =
7304     LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7305     RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7306   }
7307 
7308   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7309 
// The compare yields one bit per lane, so the natural result type is an
// integer as wide as the wavefront (i32 for wave32, i64 for wave64).
7310   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7311   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7312 
7313   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7314                               DAG.getCondCode(CCOpcode));
// Match the intrinsic's declared result width.
7315   if (VT.bitsEq(CCVT))
7316     return SetCC;
7317   return DAG.getZExtOrTrunc(SetCC, DL, VT);
7318 }
7319
// Continuation of a floating-point-compare intrinsic lowering helper — the
// opening signature line is not visible in this listing. Mirrors the integer
// variant above: validates the FP predicate, legalizes f16 operands by
// extending to f32, and emits a wavefront-wide AMDGPUISD::SETCC lane mask.
7321                                   SelectionDAG &DAG) {
7322   EVT VT = N->getValueType(0);
7323 
// Operand 3 is the IR-level fcmp predicate; fold invalid predicates to poison.
7324   unsigned CondCode = N->getConstantOperandVal(3);
7325   if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7326     return DAG.getPOISON(VT);
7327 
7328   SDValue Src0 = N->getOperand(1);
7329   SDValue Src1 = N->getOperand(2);
7330   EVT CmpVT = Src0.getValueType();
7331   SDLoc SL(N);
7332 
// Illegal f16 operands are compared in f32; FP_EXTEND is exact so the
// comparison result is unchanged.
7333   if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7334     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7335     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7336   }
7337 
7338   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7339   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
// One result bit per lane: the mask is as wide as the wavefront.
7340   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7341   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7342   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7343                               DAG.getCondCode(CCOpcode));
// Adjust to the intrinsic's declared result width if it differs.
7344   if (VT.bitsEq(CCVT))
7345     return SetCC;
7346   return DAG.getZExtOrTrunc(SetCC, SL, VT);
7347 }
7348
// Continuation of a ballot intrinsic lowering helper — the opening signature
// line is not visible in this listing. Folds the common source shapes:
// a setcc condition becomes a direct AMDGPUISD::SETCC lane mask, constant
// false/true become 0 / the EXEC mask, and any other i1 source is compared
// against zero after zero-extension.
7350                                    SelectionDAG &DAG) {
7351   EVT VT = N->getValueType(0);
7352   SDValue Src = N->getOperand(1);
7353   SDLoc SL(N);
7354 
7355   if (Src.getOpcode() == ISD::SETCC) {
7356     SDValue Op0 = Src.getOperand(0);
7357     SDValue Op1 = Src.getOperand(1);
7358     // Need to expand bfloat to float for comparison (setcc).
7359     if (Op0.getValueType() == MVT::bf16) {
7360       Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7361       Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7362     }
7363     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7364     return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7365   }
7366   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7367     // (ballot 0) -> 0
7368     if (Arg->isZero())
7369       return DAG.getConstant(0, SL, VT);
7370 
7371     // (ballot 1) -> EXEC/EXEC_LO
7372     if (Arg->isOne()) {
// Pick the EXEC register matching the requested mask width; any other width
// is left for generic handling.
7373       Register Exec;
7374       if (VT.getScalarSizeInBits() == 32)
7375         Exec = AMDGPU::EXEC_LO;
7376       else if (VT.getScalarSizeInBits() == 64)
7377         Exec = AMDGPU::EXEC;
7378       else
7379         return SDValue();
7380 
7381       return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7382     }
7383   }
7384 
7385   // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7386   //                      ISD::SETNE)
7387   return DAG.getNode(
7388       AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7389       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7390 }
7391
7393 SelectionDAG &DAG) {
7394 EVT VT = N->getValueType(0);
7395 unsigned ValSize = VT.getSizeInBits();
7396 unsigned IID = N->getConstantOperandVal(0);
7397 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7398 IID == Intrinsic::amdgcn_permlanex16;
7399 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7400 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7401 SDLoc SL(N);
7402 MVT IntVT = MVT::getIntegerVT(ValSize);
7403 const GCNSubtarget *ST = TLI.getSubtarget();
7404 unsigned SplitSize = 32;
7405 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7406 ST->hasDPALU_DPP() &&
7407 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7408 SplitSize = 64;
7409
7410 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7411 SDValue Src2, MVT ValT) -> SDValue {
7412 SmallVector<SDValue, 8> Operands;
7413 switch (IID) {
7414 case Intrinsic::amdgcn_permlane16:
7415 case Intrinsic::amdgcn_permlanex16:
7416 case Intrinsic::amdgcn_update_dpp:
7417 Operands.push_back(N->getOperand(6));
7418 Operands.push_back(N->getOperand(5));
7419 Operands.push_back(N->getOperand(4));
7420 [[fallthrough]];
7421 case Intrinsic::amdgcn_writelane:
7422 Operands.push_back(Src2);
7423 [[fallthrough]];
7424 case Intrinsic::amdgcn_readlane:
7425 case Intrinsic::amdgcn_set_inactive:
7426 case Intrinsic::amdgcn_set_inactive_chain_arg:
7427 case Intrinsic::amdgcn_mov_dpp8:
7428 Operands.push_back(Src1);
7429 [[fallthrough]];
7430 case Intrinsic::amdgcn_readfirstlane:
7431 case Intrinsic::amdgcn_permlane64:
7432 Operands.push_back(Src0);
7433 break;
7434 default:
7435 llvm_unreachable("unhandled lane op");
7436 }
7437
7438 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7439 std::reverse(Operands.begin(), Operands.end());
7440
7441 if (SDNode *GL = N->getGluedNode()) {
7442 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7443 GL = GL->getOperand(0).getNode();
7444 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7445 SDValue(GL, 0)));
7446 }
7447
7448 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7449 };
7450
7451 SDValue Src0 = N->getOperand(1);
7452 SDValue Src1, Src2;
7453 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7454 IID == Intrinsic::amdgcn_mov_dpp8 ||
7455 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7456 Src1 = N->getOperand(2);
7457 if (IID == Intrinsic::amdgcn_writelane ||
7458 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7459 Src2 = N->getOperand(3);
7460 }
7461
7462 if (ValSize == SplitSize) {
7463 // Already legal
7464 return SDValue();
7465 }
7466
7467 if (ValSize < 32) {
7468 bool IsFloat = VT.isFloatingPoint();
7469 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7470 SL, MVT::i32);
7471
7472 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7473 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7474 SL, MVT::i32);
7475 }
7476
7477 if (IID == Intrinsic::amdgcn_writelane) {
7478 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7479 SL, MVT::i32);
7480 }
7481
7482 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7483 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7484 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7485 }
7486
7487 if (ValSize % SplitSize != 0)
7488 return SDValue();
7489
7490 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7491 EVT VT = N->getValueType(0);
7492 unsigned NE = VT.getVectorNumElements();
7493 EVT EltVT = VT.getVectorElementType();
7495 unsigned NumOperands = N->getNumOperands();
7496 SmallVector<SDValue, 4> Operands(NumOperands);
7497 SDNode *GL = N->getGluedNode();
7498
7499 // only handle convergencectrl_glue
7501
7502 for (unsigned i = 0; i != NE; ++i) {
7503 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7504 ++j) {
7505 SDValue Operand = N->getOperand(j);
7506 EVT OperandVT = Operand.getValueType();
7507 if (OperandVT.isVector()) {
7508 // A vector operand; extract a single element.
7509 EVT OperandEltVT = OperandVT.getVectorElementType();
7510 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7511 Operand, DAG.getVectorIdxConstant(i, SL));
7512 } else {
7513 // A scalar operand; just use it as is.
7514 Operands[j] = Operand;
7515 }
7516 }
7517
7518 if (GL)
7519 Operands[NumOperands - 1] =
7520 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7521 SDValue(GL->getOperand(0).getNode(), 0));
7522
7523 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7524 }
7525
7526 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7527 return DAG.getBuildVector(VecVT, SL, Scalars);
7528 };
7529
7530 if (VT.isVector()) {
7531 switch (MVT::SimpleValueType EltTy =
7533 case MVT::i32:
7534 case MVT::f32:
7535 if (SplitSize == 32) {
7536 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7537 return unrollLaneOp(LaneOp.getNode());
7538 }
7539 [[fallthrough]];
7540 case MVT::i16:
7541 case MVT::f16:
7542 case MVT::bf16: {
7543 unsigned SubVecNumElt =
7544 SplitSize / VT.getVectorElementType().getSizeInBits();
7545 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7547 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7548 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7549 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7550 DAG.getConstant(EltIdx, SL, MVT::i32));
7551
7552 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7553 IsPermLane16)
7554 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7555 DAG.getConstant(EltIdx, SL, MVT::i32));
7556
7557 if (IID == Intrinsic::amdgcn_writelane)
7558 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7559 DAG.getConstant(EltIdx, SL, MVT::i32));
7560
7561 Pieces.push_back(
7562 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7563 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7564 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7565 EltIdx += SubVecNumElt;
7566 }
7567 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7568 }
7569 default:
7570 // Handle all other cases by bitcasting to i32 vectors
7571 break;
7572 }
7573 }
7574
7575 MVT VecVT =
7576 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7577 Src0 = DAG.getBitcast(VecVT, Src0);
7578
7579 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7580 Src1 = DAG.getBitcast(VecVT, Src1);
7581
7582 if (IID == Intrinsic::amdgcn_writelane)
7583 Src2 = DAG.getBitcast(VecVT, Src2);
7584
7585 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7586 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7587 return DAG.getBitcast(VT, UnrolledLaneOp);
7588}
7589
                                  SelectionDAG &DAG) {
  // Lower a wave-wide lane shuffle via ds_bpermute. NOTE(review): the first
  // line of the signature is missing from this rendering of the source.
  EVT VT = N->getValueType(0);

  // Only 32-bit-wide values can be fed through ds_bpermute directly.
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  SDValue Value = N->getOperand(1);
  SDValue Index = N->getOperand(2);

  // ds_bpermute requires index to be multiplied by 4
  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
  SDValue ShiftedIndex =
      DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);

  // Intrinsics will require i32 to operate on
  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);

  // Helper: build an INTRINSIC_WO_CHAIN node with the intrinsic id as the
  // first operand followed by the supplied arguments.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue> Operands(1);
    Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
    Operands.append(IntrinArgs);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
  };

  // If we can bpermute across the whole wave, then just do that
  // NOTE(review): the guarding `if` condition line is missing from this
  // rendering of the source.
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, BPermute);
  }

  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode
  SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));

  // Set inactive lanes to poison
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  // permlane64 swaps the two 32-lane halves of the value.
  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from
  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
                    {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});

  // Bit 5 (value 32) of (tid ^ index) is set exactly when the source lane is
  // in the other 32-lane half.
  SDValue SameOrOtherHalf =
      DAG.getNode(ISD::AND, SL, MVT::i32,
                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
                  DAG.getTargetConstant(32, SL, MVT::i32));
  SDValue UseSameHalf =
      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
                   DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
                                 BPermOtherHalfWWM);
  return DAG.getBitcast(VT, Result);
}
7667
                                          SelectionDAG &DAG) const {
  // Replace results of N that have illegal types with legalized equivalents,
  // appending them to Results. NOTE(review): the function signature and
  // several `case` labels are missing from this rendering of the source; the
  // gaps are marked below.
  switch (N->getOpcode()) {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
    unsigned IID = N->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      // The packed conversion is produced as i32 and reinterpreted as v2f16.
      SDValue Cvt =
          DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      unsigned Opcode;

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      EVT VT = N->getValueType(0);
      if (isTypeLegal(VT))
        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
      else {
        // Illegal result type: build the node as i32 and bitcast to v2i16.
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      }
      return;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
      // s_buffer_load_i8.
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(1);
      SDValue Offset = Op.getOperand(2);
      SDValue CachePolicy = Op.getOperand(3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      SDLoc DL(Op);
      const DataLayout &DataLayout = DAG.getDataLayout();
      // NOTE(review): the initializer of Alignment and the construction of the
      // memory operand (MMO) are missing from this rendering of the source.
      Align Alignment =
              VT.getStoreSize(), Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        // Uniform offset: emit a scalar unsigned-byte buffer load and truncate
        // the 32-bit result to i8.
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
                                    DAG.getVTList(MVT::i32), Ops, VT, MMO);
        LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
      } else {
        // Divergent offset: fall back to a vector buffer load.
        SDValue Ops[] = {
            DAG.getEntryNode(),                    // Chain
            Rsrc,                                  // rsrc
            DAG.getConstant(0, DL, MVT::i32),      // vindex
            {},                                    // voffset
            {},                                    // soffset
            {},                                    // offset
            CachePolicy,                           // cachepolicy
            DAG.getTargetConstant(0, DL, MVT::i1), // idxen
        };
        // Fills in the voffset/soffset/offset slots (&Ops[3..5]) above.
        setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      }
      Results.push_back(LoadVal);
      return;
    }
    case Intrinsic::amdgcn_dead: {
      // Every result of the dead intrinsic becomes poison.
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
        Results.push_back(DAG.getPOISON(N->getValueType(I)));
      return;
    }
    }
    break;
  }
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // FIXME: Hacky
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Res.getOperand(I));
        }
      } else {
        Results.push_back(Res);
        Results.push_back(Res.getValue(1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    // Select on an equivalently-sized integer type, extending to i32 first if
    // the integer type is narrower than i32.
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect =
        DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    // Flip both packed f16 sign bits in one i32 xor.
    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    // Clear both packed f16 sign bits in one i32 and.
    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(0) != MVT::f16)
      break;
    Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
    break;
  }
  default:
    // NOTE(review): the statement preceding this break (default handling) is
    // missing from this rendering of the source.
    break;
  }
}
7854
7855/// Helper function for LowerBRCOND
7856static SDNode *findUser(SDValue Value, unsigned Opcode) {
7857
7858 for (SDUse &U : Value->uses()) {
7859 if (U.get() != Value)
7860 continue;
7861
7862 if (U.getUser()->getOpcode() == Opcode)
7863 return U.getUser();
7864 }
7865 return nullptr;
7866}
7867
7868unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7869 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7870 switch (Intr->getConstantOperandVal(1)) {
7871 case Intrinsic::amdgcn_if:
7872 return AMDGPUISD::IF;
7873 case Intrinsic::amdgcn_else:
7874 return AMDGPUISD::ELSE;
7875 case Intrinsic::amdgcn_loop:
7876 return AMDGPUISD::LOOP;
7877 case Intrinsic::amdgcn_end_cf:
7878 llvm_unreachable("should not occur");
7879 default:
7880 return 0;
7881 }
7882 }
7883
7884 // break, if_break, else_break are all only used as inputs to loop, not
7885 // directly as branch conditions.
7886 return 0;
7887}
7888
// NOTE(review): the enclosing function's signature and the tail of the final
// return expression are missing from this rendering of the source. The
// visible logic rejects PAL/Mesa3D targets before the address-space check.

  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
    return false;

  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
  return (GV->getValueType()->isFunctionTy() ||
}
7906
  // Emit a PC-relative relocation when neither an absolute fixup nor a GOT
  // access applies. NOTE(review): the function signature is not visible in
  // this rendering of the source — confirm against the full file.
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
7910
  // NOTE(review): the function signature is not visible in this rendering of
  // the source. Anything without external linkage qualifies unconditionally.
  if (!GV->hasExternalLinkage())
    return true;

  // Externally-linked values qualify only on the AMDHSA and AMDPAL OSes.
  const auto OS = getTargetMachine().getTargetTriple().getOS();
  return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
}
7918
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  // Look through a possible condition negation (setcc or xor-with-nonzero) to
  // find the underlying intrinsic node.
  switch (Intr->getOpcode()) {
  case ISD::SETCC: {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();
    break;
  }
  case ISD::XOR: {
    // Similar to SETCC, if we have (xor c, -1), we will be fine.
    SDValue LHS = Intr->getOperand(0);
    SDValue RHS = Intr->getOperand(1);
    if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
      Intr = LHS.getNode();
      break;
    }
    [[fallthrough]];
  }
  default: {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  // NOTE(review): the second half of this initializer is missing from this
  // rendering of the source.
  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||

  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  // NOTE(review): the declaration of Ops is missing from this rendering of
  // the source.
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  // Copy the intrinsic's operands (skipping chain/id) and append the branch
  // destination as the final operand.
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
    // NOTE(review): the statement consuming this Ops array is missing from
    // this rendering of the source.
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
  }

  // The chain is always the last result of the new node.
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
                             SDValue(Result, i - 1), SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
                                Intr->getOperand(0));

  return Chain;
}
8014
8015SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8016 MVT VT = Op.getSimpleValueType();
8017 SDLoc DL(Op);
8018 // Checking the depth
8019 if (Op.getConstantOperandVal(0) != 0)
8020 return DAG.getConstant(0, DL, VT);
8021
8022 MachineFunction &MF = DAG.getMachineFunction();
8023 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8024 // Check for kernel and shader functions
8025 if (Info->isEntryFunction())
8026 return DAG.getConstant(0, DL, VT);
8027
8028 MachineFrameInfo &MFI = MF.getFrameInfo();
8029 // There is a call to @llvm.returnaddress in this function
8030 MFI.setReturnAddressIsTaken(true);
8031
8032 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8033 // Get the return address reg and mark it as an implicit live-in
8034 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8035 getRegClassFor(VT, Op.getNode()->isDivergent()));
8036
8037 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8038}
8039
8040SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8041 MachineFunction &MF = DAG.getMachineFunction();
8042 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8043
8044 // For functions that set up their own stack, select the GET_STACK_BASE
8045 // pseudo.
8046 if (MFI->isBottomOfStack())
8047 return Op;
8048
8049 // For everything else, create a dummy stack object.
8050 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8051 return DAG.getFrameIndex(FI, Op.getValueType());
8052}
8053
8054SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8055 const SDLoc &DL, EVT VT) const {
8056 return Op.getValueType().bitsLE(VT)
8057 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8058 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8059 DAG.getTargetConstant(0, DL, MVT::i32));
8060}
8061
8062SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8063 SelectionDAG &DAG) const {
8064 EVT DstVT = Op.getValueType();
8065 unsigned NumElts = DstVT.getVectorNumElements();
8066 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8067
8068 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8069
8070 SDLoc DL(Op);
8071 unsigned Opc = Op.getOpcode();
8072 SDValue Flags = Op.getOperand(1);
8073 EVT HalfDstVT =
8074 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8075 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8076 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8077
8078 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8079}
8080
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // Vector f16 results use v_cvt_pk_f16_f32; anything wider than v2f32 is
  // first split into halves.
  if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
    assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    if (SrcVT.getScalarType() != MVT::f32)
      return SDValue();
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  }

  // The remainder of this function only handles narrowing from f64.
  if (SrcVT.getScalarType() != MVT::f64)
    return Op;

  SDLoc DL(Op);
  if (DstVT == MVT::f16) {
    // TODO: Handle strictfp
    if (Op.getOpcode() != ISD::FP_ROUND)
      return Op;

    if (!Subtarget->has16BitInsts()) {
      // Without 16-bit instructions, go through the i32-producing FP_TO_FP16
      // node and reinterpret its low 16 bits as f16.
      SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    }
    if (Op->getFlags().hasApproximateFuncs()) {
      // With afn, double rounding through f32 is acceptable.
      SDValue Flags = Op.getOperand(1);
      SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
      return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
    }
    // Otherwise use the safe (single-rounding) f64 -> f16 lowering.
    SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
  }

  assert(DstVT.getScalarType() == MVT::bf16 &&
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");

  // Round-inexact-to-odd f64 to f32, then do the final rounding using the
  // hardware f32 -> bf16 instruction.
  EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
  SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
  return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}
8128
8129SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8130 SelectionDAG &DAG) const {
8131 EVT VT = Op.getValueType();
8132 const MachineFunction &MF = DAG.getMachineFunction();
8133 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8134 bool IsIEEEMode = Info->getMode().IEEE;
8135
8136 // FIXME: Assert during selection that this is only selected for
8137 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8138 // mode functions, but this happens to be OK since it's only done in cases
8139 // where there is known no sNaN.
8140 if (IsIEEEMode)
8141 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8142
8143 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8144 VT == MVT::v16bf16)
8145 return splitBinaryVectorOp(Op, DAG);
8146 return Op;
8147}
8148
8149SDValue
8150SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8151 SelectionDAG &DAG) const {
8152 EVT VT = Op.getValueType();
8153 const MachineFunction &MF = DAG.getMachineFunction();
8154 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8155 bool IsIEEEMode = Info->getMode().IEEE;
8156
8157 if (IsIEEEMode)
8158 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8159
8160 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8161 VT == MVT::v16bf16)
8162 return splitBinaryVectorOp(Op, DAG);
8163 return Op;
8164}
8165
SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  // Vectors are handled by splitting into two half-width operations.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");

  // Widen f16 operation to v2f16

  // fminimum f16:x, f16:y ->
  //   extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
  //                                (v2f16 (scalar_to_vector y))), 0
  SDLoc SL(Op);
  SDValue WideSrc0 =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
  SDValue WideSrc1 =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));

  SDValue Widened =
      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);

  // The scalar result is element 0 of the packed result.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
                     DAG.getConstant(0, SL, MVT::i32));
}
8194
8195SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8196 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8197 EVT VT = Op.getValueType();
8198 assert(VT == MVT::f16);
8199
8200 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8201 EVT ExpVT = Exp.getValueType();
8202 if (ExpVT == MVT::i16)
8203 return Op;
8204
8205 SDLoc DL(Op);
8206
8207 // Correct the exponent type for f16 to i16.
8208 // Clamp the range of the exponent to the instruction's range.
8209
8210 // TODO: This should be a generic narrowing legalization, and can easily be
8211 // for GlobalISel.
8212
8213 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8214 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8215
8216 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8217 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8218
8219 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8220
8221 if (IsStrict) {
8222 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8223 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8224 }
8225
8226 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8227}
8228
  // Pick the extension kind the operands of Op need when the operation is
  // promoted to a wider integer type. NOTE(review): the function signature
  // line is missing from this rendering of the source.
  switch (Op->getOpcode()) {
  case ISD::SRA:
  case ISD::SMIN:
  case ISD::SMAX:
    return ISD::SIGN_EXTEND;
  case ISD::SRL:
  case ISD::UMIN:
  case ISD::UMAX:
    return ISD::ZERO_EXTEND;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SELECT:
  case ISD::MUL:
    // operation result won't be influenced by garbage high bits.
    // TODO: are all of those cases correct, and are there more?
    return ISD::ANY_EXTEND;
  case ISD::SETCC: {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    // NOTE(review): the return statement for this case is missing from this
    // rendering of the source.
  }
  default:
    llvm_unreachable("unexpected opcode!");
  }
}
8258
// Promote a supported narrow operation to operate on i32 (element type),
// extending the operands as required and truncating the result back.
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);

  // For setcc the narrow type is that of the compared operands, not of the
  // (i1) result.
  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
                                 : Op->getOperand(0).getValueType();
  auto &DAG = DCI.DAG;
  auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);

  // Bail out if it is too early to tell, or if keeping the narrow type is
  // considered profitable for this node.
  if (DCI.isBeforeLegalizeOps() ||
      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
    return SDValue();

  SDLoc DL(Op);
  SDValue LHS;
  SDValue RHS;
  // For select, the values being selected are operands 1 and 2; operand 0 is
  // the condition and is left untouched.
  if (Opc == ISD::SELECT) {
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
  } else {
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  }

  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});

  // Special case: for shifts, the RHS always needs a zext.
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
  else
    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});

  // setcc always return i1/i1 vec so no need to truncate after.
  if (Opc == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
  }

  // For other ops, we extend the operation's return type as well so we need to
  // truncate back to the original type.
  SDValue NewVal;
  if (Opc == ISD::SELECT)
    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
  else
    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});

  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
}
8313
8314SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8315 SDValue Mag = Op.getOperand(0);
8316 EVT MagVT = Mag.getValueType();
8317
8318 if (MagVT.getVectorNumElements() > 2)
8319 return splitBinaryVectorOp(Op, DAG);
8320
8321 SDValue Sign = Op.getOperand(1);
8322 EVT SignVT = Sign.getValueType();
8323
8324 if (MagVT == SignVT)
8325 return Op;
8326
8327 // fcopysign v2f16:mag, v2f32:sign ->
8328 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8329
8330 SDLoc SL(Op);
8331 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8332 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8333
8334 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8335
8336 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8337}
8338
// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  //    multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  //    operands are zero-extended/sign-extended from 32-bits, then we split the
  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  //    possible to check if the operands are zero-extended or sign-extended in
  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  //    If the cost model decides that we have to use vector registers, then
  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  //    SIInstrInfo.cpp .

  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  // 33 sign bits means the value fits in a signed 32-bit integer.
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
8398
8399SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8400 EVT VT = Op.getValueType();
8401 SDLoc SL(Op);
8402 SDValue LHS = Op.getOperand(0);
8403 SDValue RHS = Op.getOperand(1);
8404 bool isSigned = Op.getOpcode() == ISD::SMULO;
8405
8406 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8407 const APInt &C = RHSC->getAPIntValue();
8408 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8409 if (C.isPowerOf2()) {
8410 // smulo(x, signed_min) is same as umulo(x, signed_min).
8411 bool UseArithShift = isSigned && !C.isMinSignedValue();
8412 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8413 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8414 SDValue Overflow =
8415 DAG.getSetCC(SL, MVT::i1,
8416 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8417 Result, ShiftAmt),
8418 LHS, ISD::SETNE);
8419 return DAG.getMergeValues({Result, Overflow}, SL);
8420 }
8421 }
8422
8423 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8424 SDValue Top =
8425 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8426
8427 SDValue Sign = isSigned
8428 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8429 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8430 SL, MVT::i32))
8431 : DAG.getConstant(0, SL, VT);
8432 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8433
8434 return DAG.getMergeValues({Result, Overflow}, SL);
8435}
8436
8437SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8438 if (Op->isDivergent()) {
8439 // Select to V_MAD_[IU]64_[IU]32.
8440 return Op;
8441 }
8442 if (Subtarget->hasSMulHi()) {
8443 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8444 return SDValue();
8445 }
8446 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8447 // calculate the high part, so we might as well do the whole thing with
8448 // V_MAD_[IU]64_[IU]32.
8449 return Op;
8450}
8451
8452SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8453 if (!Subtarget->hasTrapHandler() ||
8454 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8455 return lowerTrapEndpgm(Op, DAG);
8456
8457 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8458 : lowerTrapHsaQueuePtr(Op, DAG);
8459}
8460
8461SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8462 SDLoc SL(Op);
8463 SDValue Chain = Op.getOperand(0);
8464 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8465}
8466
/// Load one implicit kernel argument of type \p VT.
///
/// \p Param selects the implicit parameter; its byte offset is obtained from
/// getImplicitParameterOffset and applied to the kernarg base pointer.
SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             const SDLoc &DL, Align Alignment,
                                             ImplicitParameter Param) const {
  MachineFunction &MF = DAG.getMachineFunction();
  uint64_t Offset = getImplicitParameterOffset(MF, Param);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
  // NOTE(review): the initializer of PtrInfo and the trailing arguments of
  // the getLoad call are missing from this rendering of the source.
  MachinePointerInfo PtrInfo =
  return DAG.getLoad(
      VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
}
8480
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  // NOTE(review): the condition guarding this branch (based on the module's
  // code object version) is missing from this rendering of the source.
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // Copy the queue pointer into SGPR0_SGPR1 and feed both into the TRAP node.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
8516
8517SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8518 SDLoc SL(Op);
8519 SDValue Chain = Op.getOperand(0);
8520
8521 // We need to simulate the 's_trap 2' instruction on targets that run in
8522 // PRIV=1 (where it is treated as a nop).
8523 if (Subtarget->hasPrivEnabledTrap2NopBug())
8524 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8525
8526 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8527 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8528 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8529}
8530
8531SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8532 SDLoc SL(Op);
8533 SDValue Chain = Op.getOperand(0);
8534 MachineFunction &MF = DAG.getMachineFunction();
8535
8536 if (!Subtarget->hasTrapHandler() ||
8537 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8538 LLVMContext &Ctx = MF.getFunction().getContext();
8539 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8540 "debugtrap handler not supported",
8541 Op.getDebugLoc(), DS_Warning));
8542 return Chain;
8543 }
8544
8545 uint64_t TrapID =
8546 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8547 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8548 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8549}
8550
/// Return the high 32 bits of the flat aperture base for address space \p AS
/// (LOCAL or PRIVATE), used to build a 64-bit flat pointer from a 32-bit
/// segment offset. Three sources are tried in order: hardware aperture
/// registers, the COV5 implicit kernarg, and finally amd_queue_t via the
/// queue pointer.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    // s_mov_b64 s[6:7], src_shared_base
    // v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  // Fallback: read the aperture out of amd_queue_t, reached through the
  // queue-pointer user SGPR.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
}
8611
/// Return true if the value is a known valid address, such that a null check is
/// not necessary.
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
    return true;

  // A constant pointer is known non-null iff it is not equal to this address
  // space's null value.
  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  // TODO: Search through arithmetic, handle arguments and loads
  // marked nonnull.
  return false;
}
8626
/// Lower ISD::ADDRSPACECAST and the amdgcn.addrspacecast.nonnull intrinsic.
///
/// Handles flat <-> local/private (with or without a null check), 32-bit
/// constant <-> 64-bit address widening/truncation, and folds invalid casts
/// to poison. global <-> flat casts never reach here (they are no-ops).
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  // Decode the operation: either a plain ADDRSPACECAST node, or the
  // ...nonnull intrinsic, which additionally lets us skip the null check.
  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // The segment pointer is just the low 32 bits of the flat pointer.
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                AMDGPU::S_MOV_B32, SL, MVT::i32,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
            0);
        Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      // Null flat pointers must map to the segment null value, which may
      // differ from 0, so select between the converted pointer and it.
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        // Compute the lane id via mbcnt (mbcnt_hi is only needed for wave64).
        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
        ThreadID = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
            AllOnes, ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
              AllOnes, ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                AMDGPU::S_MOV_B64, SL, MVT::i64,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
            0};
        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      } else {
        // Standard path: flat pointer = (aperture_hi << 32) | segment offset,
        // built as a v2i32 then bitcast to i64.
        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      // Segment null must map to the flat null (0).
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  // 32-bit constant address -> 64 bits: splice in the known high bits, or
  // zero-extend when they are known to be 0.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    if (Info->get32BitAddressHighBits() == 0)
      return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);

    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(Op->getValueType(0));
}
8757
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);  // Destination (big) vector.
  SDValue Ins = Op.getOperand(1);  // Subvector being inserted.
  SDValue Idx = Op.getOperand(2);  // Constant start index of the insertion.
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // For 16-bit elements inserted at an even (32-bit aligned) position,
  // reinterpret both vectors as i32 and move 32-bit chunks instead of
  // individual half-words.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
    // A 2-element source collapses to a single i32 scalar.
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                       MVT::i32, InsNumElts / 2);

    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        Elt = Ins;
      } else {
        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
                          DAG.getConstant(I, SL, MVT::i32));
      }
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
    }

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  }

  // Generic path: extract each inserted element and insert it one at a time.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }
  return Vec;
}
8812
8813SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8814 SelectionDAG &DAG) const {
8815 SDValue Vec = Op.getOperand(0);
8816 SDValue InsVal = Op.getOperand(1);
8817 SDValue Idx = Op.getOperand(2);
8818 EVT VecVT = Vec.getValueType();
8819 EVT EltVT = VecVT.getVectorElementType();
8820 unsigned VecSize = VecVT.getSizeInBits();
8821 unsigned EltSize = EltVT.getSizeInBits();
8822 SDLoc SL(Op);
8823
8824 // Specially handle the case of v4i16 with static indexing.
8825 unsigned NumElts = VecVT.getVectorNumElements();
8826 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8827 if (NumElts == 4 && EltSize == 16 && KIdx) {
8828 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8829
8830 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8831 DAG.getConstant(0, SL, MVT::i32));
8832 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8833 DAG.getConstant(1, SL, MVT::i32));
8834
8835 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8836 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8837
8838 unsigned Idx = KIdx->getZExtValue();
8839 bool InsertLo = Idx < 2;
8840 SDValue InsHalf = DAG.getNode(
8841 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8842 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8843 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8844
8845 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8846
8847 SDValue Concat =
8848 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8849 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8850
8851 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8852 }
8853
8854 // Static indexing does not lower to stack access, and hence there is no need
8855 // for special custom lowering to avoid stack access.
8856 if (isa<ConstantSDNode>(Idx))
8857 return SDValue();
8858
8859 // Avoid stack access for dynamic indexing by custom lowering to
8860 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8861
8862 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8863
8864 MVT IntVT = MVT::getIntegerVT(VecSize);
8865
8866 // Convert vector index to bit-index and get the required bit mask.
8867 assert(isPowerOf2_32(EltSize));
8868 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8869 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8870 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8871 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8872 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8873
8874 // 1. Create a congruent vector with the target value in each element.
8875 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8876 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8877
8878 // 2. Mask off all other indices except the required index within (1).
8879 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8880
8881 // 3. Mask off the required index within the target vector.
8882 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8883 SDValue RHS =
8884 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8885
8886 // 4. Get (2) and (3) ORed into the target vector.
8887 SDValue BFI =
8888 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8889
8890 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8891}
8892
/// Lower EXTRACT_VECTOR_ELT without going through a stack slot.
///
/// Wide vectors (128/256/512 bits) are split in half and the half containing
/// the element is selected; small vectors (<= 64 bits) are bitcast to an
/// integer and the element is extracted with a variable right shift.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  // Split wide vectors into a low and a high half (rebuilt through i64
  // pieces), then extract from whichever half the index selects.
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    // Select the half (Idx > NElem/2 - 1 means the high half) and re-extract
    // with the index masked into that half's range.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the element down to the low bits and truncate/extend to the result.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  // f16/bf16 results must round-trip through i16 since the shift result is
  // an integer.
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
8994
8995static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8996 assert(Elt % 2 == 0);
8997 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8998}
8999
9000static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9001 assert(Elt % 2 == 0);
9002 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9003 !(Mask[Elt + 1] & 1);
9004}
9005
/// Lower VECTOR_SHUFFLE by breaking it into 2-element (register-sized) pieces
/// that are reassembled with CONCAT_VECTORS, avoiding the default
/// stack-slot-based expansion.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                   vector_shuffle <1, 0> (extract_subvector rhs, 2)

  // Process the mask two elements (one output register) at a time.
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (ShouldUseConsecutiveExtract &&
      // Case 1: the pair reads consecutive elements — a single aligned
      // subvector extract from the relevant source suffices.
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
                                   SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
      // Case 2: odd-to-even pair — extract the nearest even-aligned 2-element
      // pieces of each source and emit a small 2-element shuffle over them.
      int Idx0 = SVN->getMaskElt(I);
      int Idx1 = SVN->getMaskElt(I + 1);

      SDValue SrcOp0 = SVN->getOperand(0);
      SDValue SrcOp1 = SrcOp0;
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
                                    DAG.getConstant(AlignedIdx0, SL, MVT::i32));
      SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
                                    DAG.getConstant(AlignedIdx1, SL, MVT::i32));

      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        // Two distinct pieces: second mask index addresses the second operand.
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        Result1 = DAG.getPOISON(PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
                                          {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Shuf);
    } else {
      // Fallback: scalarize the pair with two extract_vector_elts and rebuild
      // a 2-element vector.
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
                                 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
                                 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
9124
/// Lower SCALAR_TO_VECTOR as a BUILD_VECTOR whose first element is the scalar
/// and whose remaining lanes are poison (their value is unspecified anyway).
SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue SVal = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();
  EVT SValVT = SVal.getValueType();
  // Poison filler for the lanes beyond element 0.
  SDValue UndefVal = DAG.getPOISON(SValVT);
  SDLoc SL(Op);

  VElts.push_back(SVal);
  for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
    VElts.push_back(UndefVal);

  return DAG.getBuildVector(ResultVT, SL, VElts);
}
9140
/// Lower BUILD_VECTOR of 16-bit element vectors.
///
/// 2-element vectors are packed into a single i32 with shift/or; wider
/// vectors are split into 2-element chunks that are each packed into an i32
/// and then reassembled via a bitcast of the i32 vector.
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

    SDValue Lo = Op.getOperand(0);
    SDValue Hi = Op.getOperand(1);

    // Avoid adding defined bits with the zero_extend.
    if (Hi.isUndef()) {
      Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
      SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
      return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
    }

    Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
    Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);

    // Hi goes in bits [31:16]; if Lo is undef those are the only defined bits.
    SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
                                DAG.getConstant(16, SL, MVT::i32));
    if (Lo.isUndef())
      return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);

    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);

    // The two halves occupy disjoint bit ranges, hence the Disjoint flag.
    SDValue Or =
        DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  }

  // Split into 2-element chunks.
  const unsigned NumParts = VT.getVectorNumElements() / 2;
  EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
  MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());

  for (unsigned P = 0; P < NumParts; ++P) {
    SDValue Vec = DAG.getBuildVector(
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
    Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
  }

  SDValue Blend =
      DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
9191
/// Decide whether a constant offset may be folded into this global address.
                                      const GlobalAddressSDNode *GA) const {
  // OSes that use ELF REL relocations (instead of RELA) can only store a
  // 32-bit addend in the instruction, so it is not safe to allow offset folding
  // which can create arbitrary 64-bit addends. (This is only a problem for
  // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
  // the high 32 bits of the addend.)
  //
  // This should be kept in sync with how HasRelocationAddend is initialized in
  // the constructor of ELFAMDGPUAsmBackend.
  if (!Subtarget->isAmdHsaOS())
    return false;

  // We can fold offsets for anything that doesn't require a GOT relocation.
  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
}
9211
/// Build a pc-relative reference to global \p GV (+\p Offset) of type
/// \p PtrVT, using either a single 64-bit-literal PC_ADD_REL_OFFSET64 (when
/// the subtarget supports 64-bit literals) or a lo/hi relocation pair.
static SDValue
                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
    assert(GAFlags != SIInstrInfo::MO_NONE);

    // GAFlags + 2 selects the 64-bit variant of the relocation flag pair.
    SDValue Ptr =
        DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
    return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
  }

  // Lo/hi pair: GAFlags is the lo relocation, GAFlags + 1 the matching hi one;
  // with no relocation the high half is a plain zero.
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE)
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  else
    PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
9256
/// Lower a GlobalAddress node: LDS globals (including HIP-style dynamic
/// shared arrays), absolute addressing for PAL/Mesa, and pc-relative
/// fixup/reloc or GOT-load addressing for HSA.
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  const GlobalValue *GV = GSD->getGlobal();
      GV->hasExternalLinkage()) {
    const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
    // HIP uses an unsized array `extern __shared__ T s[]` or similar
    // zero-sized type in other languages to declare the dynamic shared
    // memory which size is not known at the compile time. They will be
    // allocated by the runtime and placed directly after the static
    // allocated ones. They all share the same offset.
    if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
      assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
      // Adjust alignment for that dynamic shared memory array.
      MFI->setDynLDSAlign(F, GVar);
      MFI->setUsesDynamicLDS(true);
      // The dynamic-LDS offset equals the total static LDS size, queried via
      // the GET_GROUPSTATICSIZE pseudo.
      return SDValue(
          DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
    }
  }
  }

  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
  return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  // PAL/Mesa: addresses are absolute; materialize them with s_mov.
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
          GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
      return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
                     0);
    }

    SDValue AddrLo = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};

    SDValue AddrHi = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};

    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,

  // Otherwise go through the GOT: build a pc-relative GOT slot address and
  // load the actual pointer from it.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
  PointerType *PtrTy =
  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
  MachinePointerInfo PtrInfo =

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
}
9335
9336SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9337 SelectionDAG &DAG) const {
9338 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9339 const Function &Fn = DAG.getMachineFunction().getFunction();
9340 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9341 Fn, "unsupported external symbol", Op.getDebugLoc()));
9342 return DAG.getPOISON(Op.getValueType());
9343}
9344
                                   const SDLoc &DL, SDValue V) const {
  // Write V into the m0 physical register through the SI_INIT_M0 pseudo; the
  // returned value is the pseudo's first result (MVT::Other chain), so users
  // of m0 can be ordered after this node.
  //
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
9360
9361SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9362 MVT VT,
9363 unsigned Offset) const {
9364 SDLoc SL(Op);
9365 SDValue Param = lowerKernargMemParameter(
9366 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9367 // The local size values will have the hi 16-bits as zero.
9368 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9369 DAG.getValueType(VT));
9370}
9371
                                        EVT VT) {
      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
  // Diagnose the misuse and recover with a poison value of the expected type.
  return DAG.getPOISON(VT);
}
9379
                                         EVT VT) {
      "intrinsic not supported on subtarget", DL.getDebugLoc()));
  // Diagnose the removed/unsupported intrinsic and recover with poison.
  return DAG.getPOISON(VT);
}
9387
                                    ArrayRef<SDValue> Elts) {
  // Build a BUILD_VECTOR of f32 dwords from Elts. Non-f32 elements are
  // bitcast to f32. Up to 12 elements keep their exact count; 13-16 elements
  // are widened to v16f32 with poison padding. A single element is returned
  // unwrapped.
  assert(!Elts.empty());
  MVT Type;
  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {
    Type = MVT::getVectorVT(MVT::f32, NumElts);
  } else {
    assert(Elts.size() <= 16);
    Type = MVT::v16f32;
    NumElts = 16;
  }

  SmallVector<SDValue, 16> VecElts(NumElts);
  for (unsigned i = 0; i < Elts.size(); ++i) {
    SDValue Elt = Elts[i];
    // All lanes must be f32-typed for the build_vector.
    if (Elt.getValueType() != MVT::f32)
      Elt = DAG.getBitcast(MVT::f32, Elt);
    VecElts[i] = Elt;
  }
  // Pad any widened tail lanes with poison.
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getPOISON(MVT::f32);

  if (NumElts == 1)
    return VecElts[0];
  return DAG.getBuildVector(Type, DL, VecElts);
}
9416
// Widen Src to CastVT by appending ExtraElts poison elements; a scalar Src
// contributes a single element.
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
                              SDValue Src, int ExtraElts) {
  EVT SrcVT = Src.getValueType();

  // NOTE(review): the declaration of the 'Elts' buffer was elided in this
  // view of the file.

  if (SrcVT.isVector())
    DAG.ExtractVectorElements(Src, Elts);
  else
    Elts.push_back(Src);

  SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
  while (ExtraElts--)
    Elts.push_back(Undef);

  return DAG.getBuildVector(CastVT, DL, Elts);
}
9434
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the
// required return type is an aggregate
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {
  // Determine the required return type. This is the same regardless of
  // IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  // Packed D16 (and packed-16-bit atomics) store two elements per dword.
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  // Dwords produced for the dmask lanes; halved when D16 data is packed.
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT =
      NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

  MVT MaskPopVT =
      MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

  SDValue Data(Result, 0);
  SDValue TexFail;

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    // Trim the instruction result down to just the dmask dwords.
    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
    if (MaskPopVT.isVector()) {
      Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    } else {
      Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    }
  }

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
                          NumDataDwords - MaskPopDwords);

  if (IsD16)
    Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);

  EVT LegalReqRetVT = ReqRetVT;
  if (!ReqRetVT.isVector()) {
    // Scalar result: convert through the integer type, then truncate.
    if (!Data.getValueType().isInteger())
      Data = DAG.getNode(ISD::BITCAST, DL,
                         Data.getValueType().changeTypeToInteger(), Data);
    Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
  } else {
    // We need to widen the return vector to a legal type
    if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
        ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
      LegalReqRetVT =
          ReqRetVT.getVectorNumElements() + 1);
    }
  }
  Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);

  if (IsTexFail) {
    // With TexFailCtrl enabled, the status dword sits right after the data
    // dwords in the instruction result.
    TexFail =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
                    DAG.getConstant(MaskPopDwords, DL, MVT::i32));

    return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
  }

  if (Result->getNumValues() == 1)
    return Data;

  return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
9510
9511static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9512 SDValue *LWE, bool &IsTexFail) {
9513 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9514
9515 uint64_t Value = TexFailCtrlConst->getZExtValue();
9516 if (Value) {
9517 IsTexFail = true;
9518 }
9519
9520 SDLoc DL(TexFailCtrlConst);
9521 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9522 Value &= ~(uint64_t)0x1;
9523 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9524 Value &= ~(uint64_t)0x2;
9525
9526 return Value == 0;
9527}
9528
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  SDLoc DL(Op);
  // Pack consecutive pairs of 16-bit operands from [DimIdx, EndIdx) into
  // dword-sized values; unpaired/odd elements are any-extended instead.
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    SDValue Addr = Op.getOperand(I);

    // Gradients are packed with undef for each coordinate.
    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
    // 1D: undef,dx/dh; undef,dx/dv
    // 2D: dy/dh,dx/dh; dy/dv,dx/dv
    // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      // Lone element: widen the 16-bit value to a full dword.
      if (Addr.getValueType() != MVT::i16)
        Addr = DAG.getBitcast(MVT::i16, Addr);
      Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
    } else {
      // Pair this element with the next one into a 2 x 16-bit vector.
      Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
      I++;
    }
    // MIMG address operands are f32-typed dwords.
    Addr = DAG.getBitcast(MVT::f32, Addr);
    PackedAddrs.push_back(Addr);
  }
}
9557
/// Lower an image (MIMG) intrinsic to a target machine node. Handles D16
/// data values, A16/G16 operand packing, NSA vs. contiguous vaddr encodings,
/// TexFailCtrl (TFE/LWE) result adjustment, and per-generation opcode
/// selection. Returns Op unchanged when the intrinsic cannot be lowered here.
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     SelectionDAG &DAG, bool WithChain) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
  unsigned IntrOpcode = Intr->BaseOpcode;
  // For image atomic: use no-return opcode if result is unused.
  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
      !Op.getNode()->hasAnyUseOfValue(0))
    IntrOpcode = Intr->AtomicNoRetBaseOpcode;
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
  bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);

  // ResultTypes is adjusted to match the machine node being built;
  // OrigResultTypes keeps the intrinsic's original result list for
  // reconstructing the return value at the end.
  SmallVector<EVT, 3> ResultTypes(Op->values());
  SmallVector<EVT, 3> OrigResultTypes(Op->values());
  if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
    ResultTypes.erase(&ResultTypes[0]);

  bool IsD16 = false;
  bool IsG16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  // Offset of intrinsic arguments
  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMask;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);

    bool Is64Bit = VData.getValueSizeInBits() == 64;
    if (BaseOpcode->AtomicX2) {
      // Compare-and-swap style atomics carry two data operands.
      SDValue VData2 = Op.getOperand(3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      if (!BaseOpcode->NoReturn)
        ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG, true);
      }

      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
        return Op;

      // The sq block of gfx8 and gfx9 do not estimate register use correctly
      // for d16 image_gather4, image_gather4_l, and image_gather4_lz
      // instructions.
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }
  }

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;

  // Check for 16 bit addresses or derivatives and pack if true.
  MVT VAddrVT =
      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT VAddrScalarVT = VAddrVT.getScalarType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  VAddrScalarVT = VAddrVT.getScalarType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // Push back extra arguments.
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
      // Special handling of bias when A16 is on. Bias is of type half but
      // occupies full 32-bit.
      SDValue Bias = DAG.getBuildVector(
          MVT::v2f16, DL,
          {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
      VAddrs.push_back(Bias);
    } else {
      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
             "Bias needs to be converted to 16 bit in A16 mode");
      VAddrs.push_back(Op.getOperand(ArgOffset + I));
    }
  }

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    LLVM_DEBUG(
        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");
    return Op;
  }

  if (IsA16) {
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
      return Op;
    }
  }

  // We've dealt with incorrect input so we know that if IsA16, IsG16
  // are set then we have to compress/pack operands (either address,
  // gradient or both)
  // In the case where a16 and gradients are tied (no G16 support) then we
  // have already verified that both IsA16 and IsG16 are true
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // Activate g16
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
  }

  // Add gradients (packed or unpacked)
  if (IsG16) {
    // Pack the gradients
    // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
    packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
                              ArgOffset + Intr->GradientStart,
                              ArgOffset + Intr->CoordStart, Intr->NumGradients);
  } else {
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // Add addresses (packed or unpacked)
  if (IsA16) {
    packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
                              ArgOffset + Intr->CoordStart, VAddrEnd,
                              0 /* No gradients */);
  } else {
    // Add uncompressed address
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  //
  // Partial NSA is allowed on GFX11+ where the final register is a contiguous
  // set of the remaining addresses.
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  SDValue VAddr;
  if (UsePartialNSA) {
    // Only the trailing addresses are merged into one contiguous register.
    VAddr = getBuildDwordsVector(DAG, DL,
                                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
  }

  // i1 immediates used for the boolean modifier operands below.
  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    // An extra dword is returned for the TFE/LWE status.
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return
    // type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getPOISON(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, NumVDataDwords)
                                   : MVT::i32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  // Keep GLC only when the atomic's result is actually used.
  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
    return Op;

  // Assemble the machine instruction operand list.
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
    Ops.push_back(VAddr);
  } else if (UseNSA)
    append_range(Ops, VAddrs);
  else
    Ops.push_back(VAddr);
  SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
  EVT RsrcVT = Rsrc.getValueType();
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
    return Op;
  Ops.push_back(Rsrc);
  if (BaseOpcode->Sampler) {
    SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
    if (Samp.getValueType() != MVT::v4i32)
      return Op;
    Ops.push_back(Samp);
  }
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10Plus)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                        ST->hasFeature(AMDGPU::FeatureR128A16)
                    ? True
                    : False);
  if (IsGFX10Plus)
    Ops.push_back(IsA16 ? True : False);

  if (!Subtarget->hasGFX90AInsts())
    Ops.push_back(TFE); // tfe
  else if (TFE->getAsZExtVal()) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        "TFE is not supported on this GPU", DL.getDebugLoc()));
  }

  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  if (!IsGFX10Plus)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  // Pick the concrete opcode for this subtarget generation's MIMG encoding.
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
            "requested image instruction is not supported on this GPU",
            DL.getDebugLoc()));

        // Diagnosed failure: return chain/poison values of the original types.
        unsigned Idx = 0;
        SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
        for (EVT VT : OrigResultTypes) {
          if (VT == MVT::Other)
            RetValues[Idx++] = Op.getOperand(0); // Chain
          else
            RetValues[Idx++] = DAG.getPOISON(VT);
        }

        return DAG.getMergeValues(RetValues, DL);
      }
    }
    if (Opcode == -1 &&
        Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
    // Preserve the memory operand from the original memory intrinsic node.
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->NoReturn) {
    if (BaseOpcode->Atomic)
      return DAG.getMergeValues(
          {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);

    return SDValue(NewNode, 0);
  }

  if (BaseOpcode->AtomicX2) {
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  }

  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
9972
/// Lower an s.buffer.load intrinsic. Uniform offsets are lowered to scalar
/// SBUFFER_LOAD nodes (with vec3 results widened to vec4 when dwordx3 loads
/// are unavailable); divergent offsets fall back to MUBUF BUFFER_LOAD nodes,
/// split into multiple 16-byte loads for 8/16-element results.
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =
      DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      VT.getStoreSize(), Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
      auto WidenedOp = DAG.getMemIntrinsicNode(
          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
      // Trim the widened result back down to the requested three elements.
      auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
                                   DAG.getVectorIdxConstant(0, DL));
      return Subvector;
    }

    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                   DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SDValue Ops[] = {
      DAG.getEntryNode(),                    // Chain
      Rsrc,                                  // rsrc
      DAG.getConstant(0, DL, MVT::i32),      // vindex
      {},                                    // voffset
      {},                                    // soffset
      {},                                    // offset
      CachePolicy,                           // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
  };
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }

  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  // Wide results are fetched as multiple 4-dword loads.
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    // Each piece reads 16 bytes further into the buffer.
    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                        LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}
10067
10068SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10069 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10070 if (!Subtarget->hasArchitectedSGPRs())
10071 return {};
10072 SDLoc SL(Op);
10073 MVT VT = MVT::i32;
10074 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10075 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10076 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10077}
10078
10079SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10080 AMDGPU::Hwreg::Id HwReg,
10081 unsigned LowBit,
10082 unsigned Width) const {
10083 SDLoc SL(Op);
10084 using namespace AMDGPU::Hwreg;
10085 return {DAG.getMachineNode(
10086 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10087 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10088 SL, MVT::i32)),
10089 0};
10090}
10091
10092SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10093 unsigned Dim,
10094 const ArgDescriptor &Arg) const {
10095 SDLoc SL(Op);
10096 MachineFunction &MF = DAG.getMachineFunction();
10097 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10098 if (MaxID == 0)
10099 return DAG.getConstant(0, SL, MVT::i32);
10100
10101 // It's undefined behavior if a function marked with the amdgpu-no-*
10102 // attributes uses the corresponding intrinsic.
10103 if (!Arg)
10104 return DAG.getPOISON(Op->getValueType(0));
10105
10106 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10107 SDLoc(DAG.getEntryNode()), Arg);
10108
10109 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10110 // masking operations anyway.
10111 //
10112 // TODO: We could assert the top bit is 0 for the source copy.
10113 if (Arg.isMasked())
10114 return Val;
10115
10116 // Preserve the known bits after expansion to a copy.
10117 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10118 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10119 DAG.getValueType(SmallVT));
10120}
10121
10122SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10123 SelectionDAG &DAG) const {
10124 MachineFunction &MF = DAG.getMachineFunction();
10125 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10126
10127 EVT VT = Op.getValueType();
10128 SDLoc DL(Op);
10129 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10130
10131 // TODO: Should this propagate fast-math-flags?
10132
10133 switch (IntrinsicID) {
10134 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10135 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10136 return emitNonHSAIntrinsicError(DAG, DL, VT);
10137 return getPreloadedValue(DAG, *MFI, VT,
10139 }
10140 case Intrinsic::amdgcn_dispatch_ptr:
10141 case Intrinsic::amdgcn_queue_ptr: {
10142 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10143 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10144 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10145 DL.getDebugLoc()));
10146 return DAG.getPOISON(VT);
10147 }
10148
10149 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10152 return getPreloadedValue(DAG, *MFI, VT, RegID);
10153 }
10154 case Intrinsic::amdgcn_implicitarg_ptr: {
10155 if (MFI->isEntryFunction())
10156 return getImplicitArgPtr(DAG, DL);
10157 return getPreloadedValue(DAG, *MFI, VT,
10159 }
10160 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10161 if (!AMDGPU::isKernel(MF.getFunction())) {
10162 // This only makes sense to call in a kernel, so just lower to null.
10163 return DAG.getConstant(0, DL, VT);
10164 }
10165
10166 return getPreloadedValue(DAG, *MFI, VT,
10168 }
10169 case Intrinsic::amdgcn_dispatch_id: {
10170 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10171 }
10172 case Intrinsic::amdgcn_rcp:
10173 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10174 case Intrinsic::amdgcn_rsq:
10175 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10176 case Intrinsic::amdgcn_rsq_legacy:
10177 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10178 return emitRemovedIntrinsicError(DAG, DL, VT);
10179 return SDValue();
10180 case Intrinsic::amdgcn_rcp_legacy:
10181 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10182 return emitRemovedIntrinsicError(DAG, DL, VT);
10183 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10184 case Intrinsic::amdgcn_rsq_clamp: {
10185 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10186 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10187
10188 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10189 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10190 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10191
10192 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10193 SDValue Tmp =
10194 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10195 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10196 DAG.getConstantFP(Min, DL, VT));
10197 }
10198 case Intrinsic::r600_read_ngroups_x:
10199 if (Subtarget->isAmdHsaOS())
10200 return emitNonHSAIntrinsicError(DAG, DL, VT);
10201
10202 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10204 false);
10205 case Intrinsic::r600_read_ngroups_y:
10206 if (Subtarget->isAmdHsaOS())
10207 return emitNonHSAIntrinsicError(DAG, DL, VT);
10208
10209 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10211 false);
10212 case Intrinsic::r600_read_ngroups_z:
10213 if (Subtarget->isAmdHsaOS())
10214 return emitNonHSAIntrinsicError(DAG, DL, VT);
10215
10216 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10218 false);
10219 case Intrinsic::r600_read_local_size_x:
10220 if (Subtarget->isAmdHsaOS())
10221 return emitNonHSAIntrinsicError(DAG, DL, VT);
10222
10223 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10225 case Intrinsic::r600_read_local_size_y:
10226 if (Subtarget->isAmdHsaOS())
10227 return emitNonHSAIntrinsicError(DAG, DL, VT);
10228
10229 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10231 case Intrinsic::r600_read_local_size_z:
10232 if (Subtarget->isAmdHsaOS())
10233 return emitNonHSAIntrinsicError(DAG, DL, VT);
10234
10235 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10237 case Intrinsic::amdgcn_workgroup_id_x:
10238 return lowerWorkGroupId(DAG, *MFI, VT,
10242 case Intrinsic::amdgcn_workgroup_id_y:
10243 return lowerWorkGroupId(DAG, *MFI, VT,
10247 case Intrinsic::amdgcn_workgroup_id_z:
10248 return lowerWorkGroupId(DAG, *MFI, VT,
10252 case Intrinsic::amdgcn_cluster_id_x:
10253 return Subtarget->hasClusters()
10254 ? getPreloadedValue(DAG, *MFI, VT,
10256 : DAG.getPOISON(VT);
10257 case Intrinsic::amdgcn_cluster_id_y:
10258 return Subtarget->hasClusters()
10259 ? getPreloadedValue(DAG, *MFI, VT,
10261 : DAG.getPOISON(VT);
10262 case Intrinsic::amdgcn_cluster_id_z:
10263 return Subtarget->hasClusters()
10264 ? getPreloadedValue(DAG, *MFI, VT,
10266 : DAG.getPOISON(VT);
10267 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10268 return Subtarget->hasClusters()
10269 ? getPreloadedValue(
10270 DAG, *MFI, VT,
10272 : DAG.getPOISON(VT);
10273 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10274 return Subtarget->hasClusters()
10275 ? getPreloadedValue(
10276 DAG, *MFI, VT,
10278 : DAG.getPOISON(VT);
10279 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10280 return Subtarget->hasClusters()
10281 ? getPreloadedValue(
10282 DAG, *MFI, VT,
10284 : DAG.getPOISON(VT);
10285 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10286 return Subtarget->hasClusters()
10287 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10288 : SDValue();
10289 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10290 return Subtarget->hasClusters()
10291 ? getPreloadedValue(
10292 DAG, *MFI, VT,
10294 : DAG.getPOISON(VT);
10295 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10296 return Subtarget->hasClusters()
10297 ? getPreloadedValue(
10298 DAG, *MFI, VT,
10300 : DAG.getPOISON(VT);
10301 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10302 return Subtarget->hasClusters()
10303 ? getPreloadedValue(
10304 DAG, *MFI, VT,
10306 : DAG.getPOISON(VT);
10307 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10308 return Subtarget->hasClusters()
10309 ? getPreloadedValue(
10310 DAG, *MFI, VT,
10312 : DAG.getPOISON(VT);
10313 case Intrinsic::amdgcn_wave_id:
10314 return lowerWaveID(DAG, Op);
10315 case Intrinsic::amdgcn_lds_kernel_id: {
10316 if (MFI->isEntryFunction())
10317 return getLDSKernelId(DAG, DL);
10318 return getPreloadedValue(DAG, *MFI, VT,
10320 }
10321 case Intrinsic::amdgcn_workitem_id_x:
10322 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10323 case Intrinsic::amdgcn_workitem_id_y:
10324 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10325 case Intrinsic::amdgcn_workitem_id_z:
10326 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10327 case Intrinsic::amdgcn_wavefrontsize:
10328 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10329 SDLoc(Op), MVT::i32);
10330 case Intrinsic::amdgcn_s_buffer_load: {
10331 unsigned CPol = Op.getConstantOperandVal(3);
10332 // s_buffer_load, because of how it's optimized, can't be volatile
10333 // so reject ones with the volatile bit set.
10334 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10337 return Op;
10338 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10339 Op.getOperand(3), DAG);
10340 }
10341 case Intrinsic::amdgcn_fdiv_fast:
10342 return lowerFDIV_FAST(Op, DAG);
10343 case Intrinsic::amdgcn_sin:
10344 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10345
10346 case Intrinsic::amdgcn_cos:
10347 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10348
10349 case Intrinsic::amdgcn_mul_u24:
10350 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10351 Op.getOperand(2));
10352 case Intrinsic::amdgcn_mul_i24:
10353 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10354 Op.getOperand(2));
10355
10356 case Intrinsic::amdgcn_log_clamp: {
10357 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10358 return SDValue();
10359
10360 return emitRemovedIntrinsicError(DAG, DL, VT);
10361 }
10362 case Intrinsic::amdgcn_fract:
10363 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10364
10365 case Intrinsic::amdgcn_class:
10366 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10367 Op.getOperand(2));
10368 case Intrinsic::amdgcn_div_fmas:
10369 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10370 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10371
10372 case Intrinsic::amdgcn_div_fixup:
10373 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10374 Op.getOperand(2), Op.getOperand(3));
10375
10376 case Intrinsic::amdgcn_div_scale: {
10377 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10378
10379 // Translate to the operands expected by the machine instruction. The
10380 // first parameter must be the same as the first instruction.
10381 SDValue Numerator = Op.getOperand(1);
10382 SDValue Denominator = Op.getOperand(2);
10383
10384 // Note this order is opposite of the machine instruction's operations,
10385 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10386 // intrinsic has the numerator as the first operand to match a normal
10387 // division operation.
10388
10389 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10390
10391 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10392 Denominator, Numerator);
10393 }
10394 case Intrinsic::amdgcn_icmp: {
10395 // There is a Pat that handles this variant, so return it as-is.
10396 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10397 Op.getConstantOperandVal(2) == 0 &&
10398 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10399 return Op;
10400 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10401 }
10402 case Intrinsic::amdgcn_fcmp: {
10403 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10404 }
10405 case Intrinsic::amdgcn_ballot:
10406 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10407 case Intrinsic::amdgcn_fmed3:
10408 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10409 Op.getOperand(2), Op.getOperand(3));
10410 case Intrinsic::amdgcn_fdot2:
10411 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10412 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10413 case Intrinsic::amdgcn_fmul_legacy:
10414 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10415 Op.getOperand(2));
10416 case Intrinsic::amdgcn_sffbh:
10417 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10418 case Intrinsic::amdgcn_sbfe:
10419 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10420 Op.getOperand(2), Op.getOperand(3));
10421 case Intrinsic::amdgcn_ubfe:
10422 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10423 Op.getOperand(2), Op.getOperand(3));
10424 case Intrinsic::amdgcn_cvt_pkrtz:
10425 case Intrinsic::amdgcn_cvt_pknorm_i16:
10426 case Intrinsic::amdgcn_cvt_pknorm_u16:
10427 case Intrinsic::amdgcn_cvt_pk_i16:
10428 case Intrinsic::amdgcn_cvt_pk_u16: {
10429 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10430 EVT VT = Op.getValueType();
10431 unsigned Opcode;
10432
10433 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10434 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10435 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10436 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10437 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10438 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10439 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10440 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10441 else
10442 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10443
10444 if (isTypeLegal(VT))
10445 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10446
10447 SDValue Node =
10448 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10449 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10450 }
10451 case Intrinsic::amdgcn_fmad_ftz:
10452 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10453 Op.getOperand(2), Op.getOperand(3));
10454
10455 case Intrinsic::amdgcn_if_break:
10456 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10457 Op->getOperand(1), Op->getOperand(2)),
10458 0);
10459
10460 case Intrinsic::amdgcn_groupstaticsize: {
10462 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10463 return Op;
10464
10465 const Module *M = MF.getFunction().getParent();
10466 const GlobalValue *GV =
10467 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10468 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10470 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10471 }
10472 case Intrinsic::amdgcn_is_shared:
10473 case Intrinsic::amdgcn_is_private: {
10474 SDLoc SL(Op);
10475 SDValue SrcVec =
10476 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10477 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10478 DAG.getConstant(1, SL, MVT::i32));
10479
10480 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10482 : AMDGPUAS::PRIVATE_ADDRESS;
10483 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10484 Subtarget->hasGloballyAddressableScratch()) {
10485 SDValue FlatScratchBaseHi(
10486 DAG.getMachineNode(
10487 AMDGPU::S_MOV_B32, DL, MVT::i32,
10488 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10489 0);
10490 // Test bits 63..58 against the aperture address.
10491 return DAG.getSetCC(
10492 SL, MVT::i1,
10493 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10494 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10495 }
10496
10497 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10498 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10499 }
10500 case Intrinsic::amdgcn_perm:
10501 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10502 Op.getOperand(2), Op.getOperand(3));
10503 case Intrinsic::amdgcn_reloc_constant: {
10504 Module *M = MF.getFunction().getParent();
10505 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10506 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10507 auto *RelocSymbol = cast<GlobalVariable>(
10508 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10509 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10511 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10512 }
10513 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10514 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10515 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10516 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10517 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10518 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10519 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10520 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10521 if (Op.getOperand(4).getValueType() == MVT::i32)
10522 return SDValue();
10523
10524 SDLoc SL(Op);
10525 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10526 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10527 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10528 Op.getOperand(3), IndexKeyi32);
10529 }
10530 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10531 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10532 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10533 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10534 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10535 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10536 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10537 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10538 if (Op.getOperand(4).getValueType() == MVT::i64)
10539 return SDValue();
10540
10541 SDLoc SL(Op);
10542 auto IndexKeyi64 =
10543 Op.getOperand(4).getValueType() == MVT::v2i32
10544 ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
10545 : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10546 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10547 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10548 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10549 Op.getOperand(6)});
10550 }
10551 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10552 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10553 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10554 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10555 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10556 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10557 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10558 ? MVT::i64
10559 : MVT::i32;
10560 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10561 return SDValue();
10562
10563 SDLoc SL(Op);
10564 auto IndexKey =
10565 Op.getOperand(6).getValueType().isVector()
10566 ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
10567 : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10569 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10570 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10571 IndexKey, Op.getOperand(7), Op.getOperand(8)};
10572 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10573 Args.push_back(Op.getOperand(9));
10574 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
10575 }
10576 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10577 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10578 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10579 if (Op.getOperand(6).getValueType() == MVT::i32)
10580 return SDValue();
10581
10582 SDLoc SL(Op);
10583 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10584 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10585 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10586 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10587 IndexKeyi32, Op.getOperand(7)});
10588 }
10589 case Intrinsic::amdgcn_addrspacecast_nonnull:
10590 return lowerADDRSPACECAST(Op, DAG);
10591 case Intrinsic::amdgcn_readlane:
10592 case Intrinsic::amdgcn_readfirstlane:
10593 case Intrinsic::amdgcn_writelane:
10594 case Intrinsic::amdgcn_permlane16:
10595 case Intrinsic::amdgcn_permlanex16:
10596 case Intrinsic::amdgcn_permlane64:
10597 case Intrinsic::amdgcn_set_inactive:
10598 case Intrinsic::amdgcn_set_inactive_chain_arg:
10599 case Intrinsic::amdgcn_mov_dpp8:
10600 case Intrinsic::amdgcn_update_dpp:
10601 return lowerLaneOp(*this, Op.getNode(), DAG);
10602 case Intrinsic::amdgcn_dead: {
10604 for (const EVT ValTy : Op.getNode()->values())
10605 Poisons.push_back(DAG.getPOISON(ValTy));
10606 return DAG.getMergeValues(Poisons, SDLoc(Op));
10607 }
10608 case Intrinsic::amdgcn_wave_shuffle:
10609 return lowerWaveShuffle(*this, Op.getNode(), DAG);
10610 default:
10611 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10613 return lowerImage(Op, ImageDimIntr, DAG, false);
10614
10615 return Op;
10616 }
10617}
10618
// On targets not supporting constant in soffset field, turn zero to
// SGPR_NULL to avoid generating an extra s_mov with zero.
// \param SOffset the scalar offset operand of a buffer intrinsic.
// \return SGPR_NULL when the target restricts soffset and the operand is a
//         literal zero (SGPR_NULL reads as 0, so no s_mov is needed);
//         otherwise the operand unchanged.
                             const GCNSubtarget *Subtarget) {
  // hasRestrictedSOffset(): target cannot encode an inline constant in the
  // soffset field, so a plain zero would force a materializing s_mov_b32.
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
    return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
  return SOffset;
}
10627
10628SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10629 SelectionDAG &DAG,
10630 unsigned NewOpcode) const {
10631 SDLoc DL(Op);
10632
10633 SDValue VData = Op.getOperand(2);
10634 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10635 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10636 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10637 SDValue Ops[] = {
10638 Op.getOperand(0), // Chain
10639 VData, // vdata
10640 Rsrc, // rsrc
10641 DAG.getConstant(0, DL, MVT::i32), // vindex
10642 VOffset, // voffset
10643 SOffset, // soffset
10644 Offset, // offset
10645 Op.getOperand(6), // cachepolicy
10646 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10647 };
10648
10649 auto *M = cast<MemSDNode>(Op);
10650
10651 EVT MemVT = VData.getValueType();
10652 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10653 M->getMemOperand());
10654}
10655
10656SDValue
10657SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10658 unsigned NewOpcode) const {
10659 SDLoc DL(Op);
10660
10661 SDValue VData = Op.getOperand(2);
10662 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10663 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10664 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10665 SDValue Ops[] = {
10666 Op.getOperand(0), // Chain
10667 VData, // vdata
10668 Rsrc, // rsrc
10669 Op.getOperand(4), // vindex
10670 VOffset, // voffset
10671 SOffset, // soffset
10672 Offset, // offset
10673 Op.getOperand(7), // cachepolicy
10674 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10675 };
10676
10677 auto *M = cast<MemSDNode>(Op);
10678
10679 EVT MemVT = VData.getValueType();
10680 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10681 M->getMemOperand());
10682}
10683
10684SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10685 SelectionDAG &DAG) const {
10686 unsigned IntrID = Op.getConstantOperandVal(1);
10687 SDLoc DL(Op);
10688
10689 switch (IntrID) {
10690 case Intrinsic::amdgcn_ds_ordered_add:
10691 case Intrinsic::amdgcn_ds_ordered_swap: {
10692 MemSDNode *M = cast<MemSDNode>(Op);
10693 SDValue Chain = M->getOperand(0);
10694 SDValue M0 = M->getOperand(2);
10695 SDValue Value = M->getOperand(3);
10696 unsigned IndexOperand = M->getConstantOperandVal(7);
10697 unsigned WaveRelease = M->getConstantOperandVal(8);
10698 unsigned WaveDone = M->getConstantOperandVal(9);
10699
10700 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10701 IndexOperand &= ~0x3f;
10702 unsigned CountDw = 0;
10703
10704 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10705 CountDw = (IndexOperand >> 24) & 0xf;
10706 IndexOperand &= ~(0xf << 24);
10707
10708 if (CountDw < 1 || CountDw > 4) {
10709 const Function &Fn = DAG.getMachineFunction().getFunction();
10710 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10711 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10712 DL.getDebugLoc()));
10713 CountDw = 1;
10714 }
10715 }
10716
10717 if (IndexOperand) {
10718 const Function &Fn = DAG.getMachineFunction().getFunction();
10719 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10720 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10721 }
10722
10723 if (WaveDone && !WaveRelease) {
10724 // TODO: Move this to IR verifier
10725 const Function &Fn = DAG.getMachineFunction().getFunction();
10726 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10727 Fn, "ds_ordered_count: wave_done requires wave_release",
10728 DL.getDebugLoc()));
10729 }
10730
10731 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10732 unsigned ShaderType =
10734 unsigned Offset0 = OrderedCountIndex << 2;
10735 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10736
10737 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10738 Offset1 |= (CountDw - 1) << 6;
10739
10740 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10741 Offset1 |= ShaderType << 2;
10742
10743 unsigned Offset = Offset0 | (Offset1 << 8);
10744
10745 SDValue Ops[] = {
10746 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10747 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10748 };
10749 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10750 M->getVTList(), Ops, M->getMemoryVT(),
10751 M->getMemOperand());
10752 }
10753 case Intrinsic::amdgcn_raw_buffer_load:
10754 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10755 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10756 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10757 case Intrinsic::amdgcn_raw_buffer_load_format:
10758 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10759 const bool IsFormat =
10760 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10761 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10762
10763 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10764 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10765 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10766 SDValue Ops[] = {
10767 Op.getOperand(0), // Chain
10768 Rsrc, // rsrc
10769 DAG.getConstant(0, DL, MVT::i32), // vindex
10770 VOffset, // voffset
10771 SOffset, // soffset
10772 Offset, // offset
10773 Op.getOperand(5), // cachepolicy, swizzled buffer
10774 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10775 };
10776
10777 auto *M = cast<MemSDNode>(Op);
10778 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10779 }
10780 case Intrinsic::amdgcn_struct_buffer_load:
10781 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10782 case Intrinsic::amdgcn_struct_buffer_load_format:
10783 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10784 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10785 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10786 const bool IsFormat =
10787 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10788 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10789
10790 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10791 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10792 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10793 SDValue Ops[] = {
10794 Op.getOperand(0), // Chain
10795 Rsrc, // rsrc
10796 Op.getOperand(3), // vindex
10797 VOffset, // voffset
10798 SOffset, // soffset
10799 Offset, // offset
10800 Op.getOperand(6), // cachepolicy, swizzled buffer
10801 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10802 };
10803
10804 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10805 }
10806 case Intrinsic::amdgcn_raw_tbuffer_load:
10807 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10808 MemSDNode *M = cast<MemSDNode>(Op);
10809 EVT LoadVT = Op.getValueType();
10810 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10811 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10812 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10813
10814 SDValue Ops[] = {
10815 Op.getOperand(0), // Chain
10816 Rsrc, // rsrc
10817 DAG.getConstant(0, DL, MVT::i32), // vindex
10818 VOffset, // voffset
10819 SOffset, // soffset
10820 Offset, // offset
10821 Op.getOperand(5), // format
10822 Op.getOperand(6), // cachepolicy, swizzled buffer
10823 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10824 };
10825
10826 if (LoadVT.getScalarType() == MVT::f16)
10827 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10828 Ops);
10829 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10830 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10831 DAG);
10832 }
10833 case Intrinsic::amdgcn_struct_tbuffer_load:
10834 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10835 MemSDNode *M = cast<MemSDNode>(Op);
10836 EVT LoadVT = Op.getValueType();
10837 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10838 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10839 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10840
10841 SDValue Ops[] = {
10842 Op.getOperand(0), // Chain
10843 Rsrc, // rsrc
10844 Op.getOperand(3), // vindex
10845 VOffset, // voffset
10846 SOffset, // soffset
10847 Offset, // offset
10848 Op.getOperand(6), // format
10849 Op.getOperand(7), // cachepolicy, swizzled buffer
10850 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10851 };
10852
10853 if (LoadVT.getScalarType() == MVT::f16)
10854 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10855 Ops);
10856 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10857 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10858 DAG);
10859 }
10860 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10861 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10862 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10863 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10865 return lowerStructBufferAtomicIntrin(Op, DAG,
10866 AMDGPUISD::BUFFER_ATOMIC_FADD);
10867 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10868 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10869 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10870 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10871 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10872 return lowerStructBufferAtomicIntrin(Op, DAG,
10873 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10874 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10875 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10876 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10877 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10878 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10879 return lowerStructBufferAtomicIntrin(Op, DAG,
10880 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10881 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10883 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10884 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10885 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10886 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10887 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10889 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10890 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10892 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10893 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10895 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10896 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10898 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10899 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10901 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10902 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10904 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10905 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10907 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10908 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10910 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10911 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10913 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10914 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10916 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10917 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10919 return lowerStructBufferAtomicIntrin(Op, DAG,
10920 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10921 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10922 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10923 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10924 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10925 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10926 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10927 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10929 return lowerStructBufferAtomicIntrin(Op, DAG,
10930 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10931 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10933 return lowerStructBufferAtomicIntrin(Op, DAG,
10934 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10935 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10937 return lowerStructBufferAtomicIntrin(Op, DAG,
10938 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10939 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10941 return lowerStructBufferAtomicIntrin(Op, DAG,
10942 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10943 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10945 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10946 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10947 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10948 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10949 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10950 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10951 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10952 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10954 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10955 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10957 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10958 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10959 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10960 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10961 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10963 return lowerStructBufferAtomicIntrin(Op, DAG,
10964 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10965 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10966 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10967 return lowerRawBufferAtomicIntrin(Op, DAG,
10968 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10969 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10971 return lowerStructBufferAtomicIntrin(Op, DAG,
10972 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10973 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10974 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10975 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10976 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10977 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10978 SDValue Ops[] = {
10979 Op.getOperand(0), // Chain
10980 Op.getOperand(2), // src
10981 Op.getOperand(3), // cmp
10982 Rsrc, // rsrc
10983 DAG.getConstant(0, DL, MVT::i32), // vindex
10984 VOffset, // voffset
10985 SOffset, // soffset
10986 Offset, // offset
10987 Op.getOperand(7), // cachepolicy
10988 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10989 };
10990 EVT VT = Op.getValueType();
10991 auto *M = cast<MemSDNode>(Op);
10992
10993 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10994 Op->getVTList(), Ops, VT,
10995 M->getMemOperand());
10996 }
10997 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10998 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10999 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11000 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11001 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11002 SDValue Ops[] = {
11003 Op.getOperand(0), // Chain
11004 Op.getOperand(2), // src
11005 Op.getOperand(3), // cmp
11006 Rsrc, // rsrc
11007 Op.getOperand(5), // vindex
11008 VOffset, // voffset
11009 SOffset, // soffset
11010 Offset, // offset
11011 Op.getOperand(8), // cachepolicy
11012 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11013 };
11014 EVT VT = Op.getValueType();
11015 auto *M = cast<MemSDNode>(Op);
11016
11017 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11018 Op->getVTList(), Ops, VT,
11019 M->getMemOperand());
11020 }
11021 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11022 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11023 MemSDNode *M = cast<MemSDNode>(Op);
11024 SDValue NodePtr = M->getOperand(2);
11025 SDValue RayExtent = M->getOperand(3);
11026 SDValue InstanceMask = M->getOperand(4);
11027 SDValue RayOrigin = M->getOperand(5);
11028 SDValue RayDir = M->getOperand(6);
11029 SDValue Offsets = M->getOperand(7);
11030 SDValue TDescr = M->getOperand(8);
11031
11032 assert(NodePtr.getValueType() == MVT::i64);
11033 assert(RayDir.getValueType() == MVT::v3f32);
11034
11035 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11036 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11037 return SDValue();
11038 }
11039
11040 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11041 const unsigned NumVDataDwords = 10;
11042 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11043 int Opcode = AMDGPU::getMIMGOpcode(
11044 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11045 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11046 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11047 assert(Opcode != -1);
11048
11050 Ops.push_back(NodePtr);
11051 Ops.push_back(DAG.getBuildVector(
11052 MVT::v2i32, DL,
11053 {DAG.getBitcast(MVT::i32, RayExtent),
11054 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11055 Ops.push_back(RayOrigin);
11056 Ops.push_back(RayDir);
11057 Ops.push_back(Offsets);
11058 Ops.push_back(TDescr);
11059 Ops.push_back(M->getChain());
11060
11061 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11062 MachineMemOperand *MemRef = M->getMemOperand();
11063 DAG.setNodeMemRefs(NewNode, {MemRef});
11064 return SDValue(NewNode, 0);
11065 }
11066 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11067 MemSDNode *M = cast<MemSDNode>(Op);
11068 SDValue NodePtr = M->getOperand(2);
11069 SDValue RayExtent = M->getOperand(3);
11070 SDValue RayOrigin = M->getOperand(4);
11071 SDValue RayDir = M->getOperand(5);
11072 SDValue RayInvDir = M->getOperand(6);
11073 SDValue TDescr = M->getOperand(7);
11074
11075 assert(NodePtr.getValueType() == MVT::i32 ||
11076 NodePtr.getValueType() == MVT::i64);
11077 assert(RayDir.getValueType() == MVT::v3f16 ||
11078 RayDir.getValueType() == MVT::v3f32);
11079
11080 if (!Subtarget->hasGFX10_AEncoding()) {
11081 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11082 return SDValue();
11083 }
11084
11085 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11086 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11087 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11088 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11089 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11090 const unsigned NumVDataDwords = 4;
11091 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11092 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11093 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11094 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11095 IsGFX12Plus;
11096 const unsigned BaseOpcodes[2][2] = {
11097 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11098 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11099 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11100 int Opcode;
11101 if (UseNSA) {
11102 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11103 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11104 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11105 : AMDGPU::MIMGEncGfx10NSA,
11106 NumVDataDwords, NumVAddrDwords);
11107 } else {
11108 assert(!IsGFX12Plus);
11109 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11110 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11111 : AMDGPU::MIMGEncGfx10Default,
11112 NumVDataDwords, NumVAddrDwords);
11113 }
11114 assert(Opcode != -1);
11115
11117
11118 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11120 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11121 if (Lanes[0].getValueSizeInBits() == 32) {
11122 for (unsigned I = 0; I < 3; ++I)
11123 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11124 } else {
11125 if (IsAligned) {
11126 Ops.push_back(DAG.getBitcast(
11127 MVT::i32,
11128 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11129 Ops.push_back(Lanes[2]);
11130 } else {
11131 SDValue Elt0 = Ops.pop_back_val();
11132 Ops.push_back(DAG.getBitcast(
11133 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11134 Ops.push_back(DAG.getBitcast(
11135 MVT::i32,
11136 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11137 }
11138 }
11139 };
11140
11141 if (UseNSA && IsGFX11Plus) {
11142 Ops.push_back(NodePtr);
11143 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11144 Ops.push_back(RayOrigin);
11145 if (IsA16) {
11146 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11147 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11148 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11149 for (unsigned I = 0; I < 3; ++I) {
11150 MergedLanes.push_back(DAG.getBitcast(
11151 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11152 {DirLanes[I], InvDirLanes[I]})));
11153 }
11154 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11155 } else {
11156 Ops.push_back(RayDir);
11157 Ops.push_back(RayInvDir);
11158 }
11159 } else {
11160 if (Is64)
11161 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11162 2);
11163 else
11164 Ops.push_back(NodePtr);
11165
11166 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11167 packLanes(RayOrigin, true);
11168 packLanes(RayDir, true);
11169 packLanes(RayInvDir, false);
11170 }
11171
11172 if (!UseNSA) {
11173 // Build a single vector containing all the operands so far prepared.
11174 if (NumVAddrDwords > 12) {
11175 SDValue Undef = DAG.getPOISON(MVT::i32);
11176 Ops.append(16 - Ops.size(), Undef);
11177 }
11178 assert(Ops.size() >= 8 && Ops.size() <= 12);
11179 SDValue MergedOps =
11180 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11181 Ops.clear();
11182 Ops.push_back(MergedOps);
11183 }
11184
11185 Ops.push_back(TDescr);
11186 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11187 Ops.push_back(M->getChain());
11188
11189 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11190 MachineMemOperand *MemRef = M->getMemOperand();
11191 DAG.setNodeMemRefs(NewNode, {MemRef});
11192 return SDValue(NewNode, 0);
11193 }
11194 case Intrinsic::amdgcn_global_atomic_fmin_num:
11195 case Intrinsic::amdgcn_global_atomic_fmax_num:
11196 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11197 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11198 MemSDNode *M = cast<MemSDNode>(Op);
11199 SDValue Ops[] = {
11200 M->getOperand(0), // Chain
11201 M->getOperand(2), // Ptr
11202 M->getOperand(3) // Value
11203 };
11204 unsigned Opcode = 0;
11205 switch (IntrID) {
11206 case Intrinsic::amdgcn_global_atomic_fmin_num:
11207 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11208 Opcode = ISD::ATOMIC_LOAD_FMIN;
11209 break;
11210 }
11211 case Intrinsic::amdgcn_global_atomic_fmax_num:
11212 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11213 Opcode = ISD::ATOMIC_LOAD_FMAX;
11214 break;
11215 }
11216 default:
11217 llvm_unreachable("unhandled atomic opcode");
11218 }
11219 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11220 Ops, M->getMemOperand());
11221 }
11222 case Intrinsic::amdgcn_s_alloc_vgpr: {
11223 SDValue NumVGPRs = Op.getOperand(2);
11224 if (!NumVGPRs->isDivergent())
11225 return Op;
11226
11227 SDValue ReadFirstLaneID =
11228 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11229 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11230 ReadFirstLaneID, NumVGPRs);
11231
11232 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11233 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11234 }
11235 case Intrinsic::amdgcn_s_get_barrier_state:
11236 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11237 SDValue Chain = Op->getOperand(0);
11239 unsigned Opc;
11240
11241 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11242 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11243 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11244 BarID = (BarID >> 4) & 0x3F;
11245 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11246 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11247 Ops.push_back(K);
11248 Ops.push_back(Chain);
11249 } else {
11250 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11251 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11252 SDValue M0Val;
11253 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11254 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11255 M0Val = SDValue(
11256 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11257 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11258 0);
11259 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11260 } else
11261 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11262 }
11263
11264 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11265 return SDValue(NewMI, 0);
11266 }
11267 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11268 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11269 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11270 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11271 SDValue Chain = Op->getOperand(0);
11272 SDValue Ptr = Op->getOperand(2);
11273 EVT VT = Op->getValueType(0);
11274 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11275 Chain, Ptr, MII->getMemOperand());
11276 }
11277 case Intrinsic::amdgcn_flat_load_monitor_b32:
11278 case Intrinsic::amdgcn_flat_load_monitor_b64:
11279 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11280 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11281 SDValue Chain = Op->getOperand(0);
11282 SDValue Ptr = Op->getOperand(2);
11283 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
11284 Op->getVTList(), {Chain, Ptr},
11285 MII->getMemoryVT(), MII->getMemOperand());
11286 }
11287 case Intrinsic::amdgcn_global_load_monitor_b32:
11288 case Intrinsic::amdgcn_global_load_monitor_b64:
11289 case Intrinsic::amdgcn_global_load_monitor_b128: {
11290 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11291 SDValue Chain = Op->getOperand(0);
11292 SDValue Ptr = Op->getOperand(2);
11293 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
11294 Op->getVTList(), {Chain, Ptr},
11295 MII->getMemoryVT(), MII->getMemOperand());
11296 }
11297 default:
11298
11299 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11301 return lowerImage(Op, ImageDimIntr, DAG, true);
11302
11303 return SDValue();
11304 }
11305}
11306
11307// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11308// dwordx4 if on SI and handle TFE loads.
11309SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11310 SDVTList VTList,
11311 ArrayRef<SDValue> Ops, EVT MemVT,
11312 MachineMemOperand *MMO,
11313 SelectionDAG &DAG) const {
11314 LLVMContext &C = *DAG.getContext();
11315 MachineFunction &MF = DAG.getMachineFunction();
11316 EVT VT = VTList.VTs[0];
11317
11318 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11319 bool IsTFE = VTList.NumVTs == 3;
11320 if (IsTFE) {
11321 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11322 unsigned NumOpDWords = NumValueDWords + 1;
11323 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11324 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11325 MachineMemOperand *OpDWordsMMO =
11326 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11327 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11328 OpDWordsVT, OpDWordsMMO, DAG);
11329 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11330 DAG.getVectorIdxConstant(NumValueDWords, DL));
11331 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11332 SDValue ValueDWords =
11333 NumValueDWords == 1
11334 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11336 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11337 ZeroIdx);
11338 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11339 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11340 }
11341
11342 if (!Subtarget->hasDwordx3LoadStores() &&
11343 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11344 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11345 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11346 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11347 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11348 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11349 WidenedMemVT, WidenedMMO);
11351 DAG.getVectorIdxConstant(0, DL));
11352 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11353 }
11354
11355 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11356}
11357
11358SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11359 bool ImageStore) const {
11360 EVT StoreVT = VData.getValueType();
11361
11362 // No change for f16 and legal vector D16 types.
11363 if (!StoreVT.isVector())
11364 return VData;
11365
11366 SDLoc DL(VData);
11367 unsigned NumElements = StoreVT.getVectorNumElements();
11368
11369 if (Subtarget->hasUnpackedD16VMem()) {
11370 // We need to unpack the packed data to store.
11371 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11372 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11373
11374 EVT EquivStoreVT =
11375 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11376 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11377 return DAG.UnrollVectorOp(ZExt.getNode());
11378 }
11379
11380 // The sq block of gfx8.1 does not estimate register use correctly for d16
11381 // image store instructions. The data operand is computed as if it were not a
11382 // d16 image instruction.
11383 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11384 // Bitcast to i16
11385 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11386 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11387
11388 // Decompose into scalars
11390 DAG.ExtractVectorElements(IntVData, Elts);
11391
11392 // Group pairs of i16 into v2i16 and bitcast to i32
11393 SmallVector<SDValue, 4> PackedElts;
11394 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11395 SDValue Pair =
11396 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11397 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11398 PackedElts.push_back(IntPair);
11399 }
11400 if ((NumElements % 2) == 1) {
11401 // Handle v3i16
11402 unsigned I = Elts.size() / 2;
11403 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11404 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11405 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11406 PackedElts.push_back(IntPair);
11407 }
11408
11409 // Pad using UNDEF
11410 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11411
11412 // Build final vector
11413 EVT VecVT =
11414 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11415 return DAG.getBuildVector(VecVT, DL, PackedElts);
11416 }
11417
11418 if (NumElements == 3) {
11419 EVT IntStoreVT =
11421 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11422
11423 EVT WidenedStoreVT = EVT::getVectorVT(
11424 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11425 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11426 WidenedStoreVT.getStoreSizeInBits());
11427 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11428 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11429 }
11430
11431 assert(isTypeLegal(StoreVT));
11432 return VData;
11433}
11434
11435static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11436 switch (Intr) {
11437 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11438 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11439 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11440 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11441 case Intrinsic::amdgcn_load_async_to_lds:
11442 case Intrinsic::amdgcn_global_load_async_lds:
11443 return true;
11444 }
11445 return false;
11446}
11447
11448SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11449 SelectionDAG &DAG) const {
11450 SDLoc DL(Op);
11451 SDValue Chain = Op.getOperand(0);
11452 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11453
11454 switch (IntrinsicID) {
11455 case Intrinsic::amdgcn_exp_compr: {
11456 if (!Subtarget->hasCompressedExport()) {
11457 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11459 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11460 }
11461 SDValue Src0 = Op.getOperand(4);
11462 SDValue Src1 = Op.getOperand(5);
11463 // Hack around illegal type on SI by directly selecting it.
11464 if (isTypeLegal(Src0.getValueType()))
11465 return SDValue();
11466
11467 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11468 SDValue Undef = DAG.getPOISON(MVT::f32);
11469 const SDValue Ops[] = {
11470 Op.getOperand(2), // tgt
11471 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11472 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11473 Undef, // src2
11474 Undef, // src3
11475 Op.getOperand(7), // vm
11476 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11477 Op.getOperand(3), // en
11478 Op.getOperand(0) // Chain
11479 };
11480
11481 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11482 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11483 }
11484
11485 case Intrinsic::amdgcn_struct_tbuffer_store:
11486 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11487 SDValue VData = Op.getOperand(2);
11488 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11489 if (IsD16)
11490 VData = handleD16VData(VData, DAG);
11491 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11492 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11493 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11494 SDValue Ops[] = {
11495 Chain,
11496 VData, // vdata
11497 Rsrc, // rsrc
11498 Op.getOperand(4), // vindex
11499 VOffset, // voffset
11500 SOffset, // soffset
11501 Offset, // offset
11502 Op.getOperand(7), // format
11503 Op.getOperand(8), // cachepolicy, swizzled buffer
11504 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11505 };
11506 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11507 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11508 MemSDNode *M = cast<MemSDNode>(Op);
11509 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11510 M->getMemoryVT(), M->getMemOperand());
11511 }
11512
11513 case Intrinsic::amdgcn_raw_tbuffer_store:
11514 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11515 SDValue VData = Op.getOperand(2);
11516 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11517 if (IsD16)
11518 VData = handleD16VData(VData, DAG);
11519 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11520 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11521 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11522 SDValue Ops[] = {
11523 Chain,
11524 VData, // vdata
11525 Rsrc, // rsrc
11526 DAG.getConstant(0, DL, MVT::i32), // vindex
11527 VOffset, // voffset
11528 SOffset, // soffset
11529 Offset, // offset
11530 Op.getOperand(6), // format
11531 Op.getOperand(7), // cachepolicy, swizzled buffer
11532 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11533 };
11534 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11535 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11536 MemSDNode *M = cast<MemSDNode>(Op);
11537 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11538 M->getMemoryVT(), M->getMemOperand());
11539 }
11540
11541 case Intrinsic::amdgcn_raw_buffer_store:
11542 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11543 case Intrinsic::amdgcn_raw_buffer_store_format:
11544 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11545 const bool IsFormat =
11546 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11547 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11548
11549 SDValue VData = Op.getOperand(2);
11550 EVT VDataVT = VData.getValueType();
11551 EVT EltType = VDataVT.getScalarType();
11552 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11553 if (IsD16) {
11554 VData = handleD16VData(VData, DAG);
11555 VDataVT = VData.getValueType();
11556 }
11557
11558 if (!isTypeLegal(VDataVT)) {
11559 VData =
11560 DAG.getNode(ISD::BITCAST, DL,
11561 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11562 }
11563
11564 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11565 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11566 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11567 SDValue Ops[] = {
11568 Chain,
11569 VData,
11570 Rsrc,
11571 DAG.getConstant(0, DL, MVT::i32), // vindex
11572 VOffset, // voffset
11573 SOffset, // soffset
11574 Offset, // offset
11575 Op.getOperand(6), // cachepolicy, swizzled buffer
11576 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11577 };
11578 unsigned Opc =
11579 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11580 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11581 MemSDNode *M = cast<MemSDNode>(Op);
11582
11583 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11584 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11585 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11586
11587 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11588 M->getMemoryVT(), M->getMemOperand());
11589 }
11590
11591 case Intrinsic::amdgcn_struct_buffer_store:
11592 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11593 case Intrinsic::amdgcn_struct_buffer_store_format:
11594 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11595 const bool IsFormat =
11596 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11597 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11598
11599 SDValue VData = Op.getOperand(2);
11600 EVT VDataVT = VData.getValueType();
11601 EVT EltType = VDataVT.getScalarType();
11602 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11603
11604 if (IsD16) {
11605 VData = handleD16VData(VData, DAG);
11606 VDataVT = VData.getValueType();
11607 }
11608
11609 if (!isTypeLegal(VDataVT)) {
11610 VData =
11611 DAG.getNode(ISD::BITCAST, DL,
11612 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11613 }
11614
11615 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11616 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11617 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11618 SDValue Ops[] = {
11619 Chain,
11620 VData,
11621 Rsrc,
11622 Op.getOperand(4), // vindex
11623 VOffset, // voffset
11624 SOffset, // soffset
11625 Offset, // offset
11626 Op.getOperand(7), // cachepolicy, swizzled buffer
11627 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11628 };
11629 unsigned Opc =
11630 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11631 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11632 MemSDNode *M = cast<MemSDNode>(Op);
11633
11634 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11635 EVT VDataType = VData.getValueType().getScalarType();
11636 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11637 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11638
11639 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11640 M->getMemoryVT(), M->getMemOperand());
11641 }
11642 case Intrinsic::amdgcn_raw_buffer_load_lds:
11643 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11644 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11645 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11646 case Intrinsic::amdgcn_struct_buffer_load_lds:
11647 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11648 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11649 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11650 if (!Subtarget->hasVMemToLDSLoad())
11651 return SDValue();
11652 unsigned Opc;
11653 bool HasVIndex =
11654 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11655 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11656 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11657 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11658 unsigned OpOffset = HasVIndex ? 1 : 0;
11659 SDValue VOffset = Op.getOperand(5 + OpOffset);
11660 bool HasVOffset = !isNullConstant(VOffset);
11661 unsigned Size = Op->getConstantOperandVal(4);
11662
11663 switch (Size) {
11664 default:
11665 return SDValue();
11666 case 1:
11667 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11668 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11669 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11670 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11671 break;
11672 case 2:
11673 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11674 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11675 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11676 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11677 break;
11678 case 4:
11679 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11680 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11681 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11682 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11683 break;
11684 case 12:
11685 if (!Subtarget->hasLDSLoadB96_B128())
11686 return SDValue();
11687 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11688 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11689 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11690 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11691 break;
11692 case 16:
11693 if (!Subtarget->hasLDSLoadB96_B128())
11694 return SDValue();
11695 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11696 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11697 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11698 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11699 break;
11700 }
11701
11702 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11703
11705
11706 if (HasVIndex && HasVOffset)
11707 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11708 {Op.getOperand(5), // VIndex
11709 VOffset}));
11710 else if (HasVIndex)
11711 Ops.push_back(Op.getOperand(5));
11712 else if (HasVOffset)
11713 Ops.push_back(VOffset);
11714
11715 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11716 Ops.push_back(Rsrc);
11717 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11718 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11719 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11720 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11721 Ops.push_back(DAG.getTargetConstant(
11722 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11723 DL, MVT::i8)); // cpol
11724 Ops.push_back(DAG.getTargetConstant(
11725 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11726 ? 1
11727 : 0,
11728 DL, MVT::i8)); // swz
11729 Ops.push_back(
11730 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
11731 Ops.push_back(M0Val.getValue(0)); // Chain
11732 Ops.push_back(M0Val.getValue(1)); // Glue
11733
11734 auto *M = cast<MemSDNode>(Op);
11735 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11736 DAG.setNodeMemRefs(Load, M->memoperands());
11737
11738 return SDValue(Load, 0);
11739 }
11740 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11741 // for "trust me" that the remaining cases are global pointers until
11742 // such time as we can put two mem operands on an intrinsic.
11743 case Intrinsic::amdgcn_load_to_lds:
11744 case Intrinsic::amdgcn_load_async_to_lds:
11745 case Intrinsic::amdgcn_global_load_lds:
11746 case Intrinsic::amdgcn_global_load_async_lds: {
11747 if (!Subtarget->hasVMemToLDSLoad())
11748 return SDValue();
11749
11750 unsigned Opc;
11751 unsigned Size = Op->getConstantOperandVal(4);
11752 switch (Size) {
11753 default:
11754 return SDValue();
11755 case 1:
11756 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11757 break;
11758 case 2:
11759 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11760 break;
11761 case 4:
11762 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11763 break;
11764 case 12:
11765 if (!Subtarget->hasLDSLoadB96_B128())
11766 return SDValue();
11767 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11768 break;
11769 case 16:
11770 if (!Subtarget->hasLDSLoadB96_B128())
11771 return SDValue();
11772 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11773 break;
11774 }
11775
11776 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11777
11779
11780 SDValue Addr = Op.getOperand(2); // Global ptr
11781 SDValue VOffset;
11782 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11783 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11784 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11785 SDValue LHS = Addr.getOperand(0);
11786 SDValue RHS = Addr.getOperand(1);
11787
11788 if (LHS->isDivergent())
11789 std::swap(LHS, RHS);
11790
11791 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11792 RHS.getOperand(0).getValueType() == MVT::i32) {
11793 // add (i64 sgpr), (zero_extend (i32 vgpr))
11794 Addr = LHS;
11795 VOffset = RHS.getOperand(0);
11796 }
11797 }
11798
11799 Ops.push_back(Addr);
11800 if (!Addr->isDivergent()) {
11802 if (!VOffset)
11803 VOffset =
11804 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11805 DAG.getTargetConstant(0, DL, MVT::i32)),
11806 0);
11807 Ops.push_back(VOffset);
11808 }
11809
11810 Ops.push_back(Op.getOperand(5)); // Offset
11811
11812 unsigned Aux = Op.getConstantOperandVal(6);
11813 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11814 MVT::i32)); // CPol
11815 Ops.push_back(
11816 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
11817
11818 Ops.push_back(M0Val.getValue(0)); // Chain
11819 Ops.push_back(M0Val.getValue(1)); // Glue
11820
11821 auto *M = cast<MemSDNode>(Op);
11822 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11823 DAG.setNodeMemRefs(Load, M->memoperands());
11824
11825 return SDValue(Load, 0);
11826 }
11827 case Intrinsic::amdgcn_end_cf:
11828 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11829 Op->getOperand(2), Chain),
11830 0);
11831 case Intrinsic::amdgcn_s_barrier_init:
11832 case Intrinsic::amdgcn_s_barrier_signal_var: {
11833 // these two intrinsics have two operands: barrier pointer and member count
11834 SDValue Chain = Op->getOperand(0);
11836 SDValue BarOp = Op->getOperand(2);
11837 SDValue CntOp = Op->getOperand(3);
11838 SDValue M0Val;
11839 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11840 ? AMDGPU::S_BARRIER_INIT_M0
11841 : AMDGPU::S_BARRIER_SIGNAL_M0;
11842 // extract the BarrierID from bits 4-9 of BarOp
11843 SDValue BarID;
11844 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11845 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11846 BarID =
11847 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11848 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11849 0);
11850 // Member count should be put into M0[ShAmt:+6]
11851 // Barrier ID should be put into M0[5:0]
11852 M0Val =
11853 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11854 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11855 0);
11856 constexpr unsigned ShAmt = 16;
11857 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11858 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11859
11860 M0Val = SDValue(
11861 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11862
11863 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11864
11865 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11866 return SDValue(NewMI, 0);
11867 }
11868 case Intrinsic::amdgcn_s_wakeup_barrier: {
11869 if (!Subtarget->hasSWakeupBarrier())
11870 return SDValue();
11871 [[fallthrough]];
11872 }
11873 case Intrinsic::amdgcn_s_barrier_join: {
11874 // these three intrinsics have one operand: barrier pointer
11875 SDValue Chain = Op->getOperand(0);
11877 SDValue BarOp = Op->getOperand(2);
11878 unsigned Opc;
11879
11880 if (isa<ConstantSDNode>(BarOp)) {
11881 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11882 switch (IntrinsicID) {
11883 default:
11884 return SDValue();
11885 case Intrinsic::amdgcn_s_barrier_join:
11886 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11887 break;
11888 case Intrinsic::amdgcn_s_wakeup_barrier:
11889 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11890 break;
11891 }
11892 // extract the BarrierID from bits 4-9 of the immediate
11893 unsigned BarID = (BarVal >> 4) & 0x3F;
11894 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11895 Ops.push_back(K);
11896 Ops.push_back(Chain);
11897 } else {
11898 switch (IntrinsicID) {
11899 default:
11900 return SDValue();
11901 case Intrinsic::amdgcn_s_barrier_join:
11902 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11903 break;
11904 case Intrinsic::amdgcn_s_wakeup_barrier:
11905 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11906 break;
11907 }
11908 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11909 SDValue M0Val;
11910 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11911 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11912 M0Val =
11913 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11914 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11915 0);
11916 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11917 }
11918
11919 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11920 return SDValue(NewMI, 0);
11921 }
11922 case Intrinsic::amdgcn_s_prefetch_data: {
11923 // For non-global address space preserve the chain and remove the call.
11925 return Op.getOperand(0);
11926 return Op;
11927 }
11928 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11929 SDValue Ops[] = {
11930 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11931 Op.getOperand(3), // offset
11932 Op.getOperand(4), // length
11933 };
11934
11935 MemSDNode *M = cast<MemSDNode>(Op);
11936 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11937 Op->getVTList(), Ops, M->getMemoryVT(),
11938 M->getMemOperand());
11939 }
11940 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11941 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11942 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11943 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11944 SDValue Chain = Op->getOperand(0);
11945 SDValue Ptr = Op->getOperand(2);
11946 SDValue Val = Op->getOperand(3);
11947 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11948 Ptr, MII->getMemOperand());
11949 }
11950 default: {
11951 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11953 return lowerImage(Op, ImageDimIntr, DAG, true);
11954
11955 return Op;
11956 }
11957 }
11958}
11959
11960// Return whether the operation has NoUnsignedWrap property.
11961static bool isNoUnsignedWrap(SDValue Addr) {
11962 return (Addr.getOpcode() == ISD::ADD &&
11963 Addr->getFlags().hasNoUnsignedWrap()) ||
11964 Addr->getOpcode() == ISD::OR;
11965}
11966
11968 EVT PtrVT) const {
11969 return PtrVT == MVT::i64;
11970}
11971
11973 EVT PtrVT) const {
11974 return true;
11975}
11976
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  // Split Offset into a variable part (N0) and a constant part (C1). Either
  // may be left null here: a pure-constant offset clears N0, a pure-variable
  // offset leaves C1 null; both are defaulted at the end.
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    if (!CheckNUW || isNoUnsignedWrap(N0)) {
      C1 = cast<ConstantSDNode>(N0.getOperand(1));
      N0 = N0.getOperand(0);
    }
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Negative overflow: fold everything back into the voffset side and
      // leave the immediate field zero.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = {N0, OverflowVal};
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  // Materialize defaults for whichever half is still missing so callers always
  // get a (voffset, immoffset) pair.
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
12036
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  // Case 1: the whole combined offset is a compile-time constant that can be
  // split between soffset and the instruction immediate; voffset stays 0.
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  // Case 2: base + constant; the base goes in voffset and the constant is
  // split between soffset and the immediate.
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }

  // Fallback: put everything in voffset. Subtargets with a restricted soffset
  // encoding use the null SGPR rather than a zero constant.
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                            : DAG.getConstant(0, DL, MVT::i32);

  Offsets[0] = CombinedOffset;
  Offsets[1] = SOffsetZero;
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
12081
12082SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12083 SelectionDAG &DAG) const {
12084 if (!MaybePointer.getValueType().isScalarInteger())
12085 return MaybePointer;
12086
12087 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12088 return Rsrc;
12089}
12090
// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  // Operands: (id, pointer, stride, num_records, flags).
  SDValue Pointer = Op->getOperand(1);
  SDValue Stride = Op->getOperand(2);
  SDValue NumRecords = Op->getOperand(3);
  SDValue Flags = Op->getOperand(4);

  SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
  SDValue Rsrc;

  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
    SDValue NumRecordsLHS =
        DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
                    DAG.getShiftAmountConstant(57, MVT::i32, Loc));
    SDValue LowHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    SDValue NumRecordsRHS =
        DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
                    DAG.getShiftAmountConstant(7, MVT::i32, Loc));
    // Stride and flags are built as {0, shifted32} v2i32 pairs and bitcast to
    // i64 so they land in the high 32 bits of the high half.
    SDValue ShiftedStride =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                    DAG.getShiftAmountConstant(12, MVT::i32, Loc));
    SDValue ExtShiftedStrideVec =
        DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
    SDValue ExtShiftedStride =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
    SDValue ShiftedFlags =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
                    DAG.getShiftAmountConstant(28, MVT::i32, Loc));
    SDValue ExtShiftedFlagsVec =
        DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
    SDValue ExtShiftedFlags =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
    SDValue CombinedFields =
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
    SDValue HighHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);

    Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
  } else {
    // Legacy v4i32 descriptor: word0 = base lo, word1 = base hi (16 bits) with
    // the stride in bits [31:16], word2 = num_records, word3 = flags.
    NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
    SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
    SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
    SDValue ShiftedStride =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                    DAG.getShiftAmountConstant(16, MVT::i32, Loc));
    SDValue NewHighHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);

    Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
                       NumRecords, Flags);
  }

  // Callers see the descriptor as a single 128-bit scalar.
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
  return RsrcPtr;
}
12160
// Handle 8 bit and 16 bit buffer loads
// Emits a 32-bit unsigned-extending buffer load and narrows the result back
// to the requested 8/16-bit type (bitcasting for f16/bf16 results).
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    // TFE variant returns {data, status} as a v2i32; element 0 is the loaded
    // value, element 1 is the texture-fail status dword.
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    // Merge {value, status, chain}.
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  // Narrow the 32-bit result back to the memory type, then bitcast to the
  // requested (possibly floating-point) type.
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
12198
12199// Handle 8 bit and 16 bit buffer stores
12200SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12201 EVT VDataType, SDLoc DL,
12202 SDValue Ops[],
12203 MemSDNode *M) const {
12204 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12205 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12206
12207 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12208 Ops[1] = BufferStoreExt;
12209 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12210 : AMDGPUISD::BUFFER_STORE_SHORT;
12211 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12212 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12213 M->getMemOperand());
12214}
12215
                                 SDValue Op, const SDLoc &SL, EVT VT) {
  // Narrower result than the loaded value: a plain truncate suffices.
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  // Otherwise widen according to the load's extension kind.
  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    // Same width and no extension requested: pass the value through.
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
12234
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // SMEM requires dword alignment and a uniform address.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  // Re-emit as a non-extending i32 load from the same address; the range
  // metadata is dropped because it described the narrower value.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  // Integer type with the same bit width as the original memory type, used
  // for the in-register extension below.
  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  // Return both the converted value and the new load's chain.
  return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
}
12304
                                          const SIMachineFunctionInfo &Info) {
  // Conservatively assume a flat access may reach the private (scratch)
  // aperture. Entry functions can only do so when flat-scratch has been
  // initialized for them; everything else is assumed to.
  // TODO: Should check if the address can definitely not access stack.
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  return true;
}
12312
// Custom-legalize loads: widen sub-dword loads, and split/widen/scalarize
// vector loads according to address space and subtarget limits.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    // i1 loads are widened through i8, everything else through i16.
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    // Vector of i1: extract each bit by shifting and truncating.
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Work around the LDS misaligned-access bug in WGP mode by splitting
  // underaligned multi-dword flat loads.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)

  unsigned NumElements = MemVT.getVectorNumElements();

  // Uniform, dword-aligned constant/invariant-global loads can be selected to
  // scalar (SMEM) loads; widen to a power-of-2 element count if needed.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the load intact only if fast misaligned access is
    // guaranteed; otherwise split.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
                               Load->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
12455
12456SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12457 EVT VT = Op.getValueType();
12458 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12459 VT.getSizeInBits() == 512)
12460 return splitTernaryVectorOp(Op, DAG);
12461
12462 assert(VT.getSizeInBits() == 64);
12463
12464 SDLoc DL(Op);
12465 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12466
12467 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12468 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12469
12470 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12471 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12472
12473 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12474 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12475
12476 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12477
12478 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12479 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12480
12481 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12482
12483 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12484 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12485}
12486
12487// Catch division cases where we can use shortcuts with rcp and rsq
12488// instructions.
12489SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12490 SelectionDAG &DAG) const {
12491 SDLoc SL(Op);
12492 SDValue LHS = Op.getOperand(0);
12493 SDValue RHS = Op.getOperand(1);
12494 EVT VT = Op.getValueType();
12495 const SDNodeFlags Flags = Op->getFlags();
12496
12497 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12498
12499 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12500 // Without !fpmath accuracy information, we can't do more because we don't
12501 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12502 // f16 is always accurate enough
12503 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12504 return SDValue();
12505
12506 if (CLHS->isExactlyValue(1.0)) {
12507 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12508 // the CI documentation has a worst case error of 1 ulp.
12509 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12510 // use it as long as we aren't trying to use denormals.
12511 //
12512 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12513
12514 // 1.0 / sqrt(x) -> rsq(x)
12515
12516 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12517 // error seems really high at 2^29 ULP.
12518 // 1.0 / x -> rcp(x)
12519 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12520 }
12521
12522 // Same as for 1.0, but expand the sign out of the constant.
12523 if (CLHS->isExactlyValue(-1.0)) {
12524 // -1.0 / x -> rcp (fneg x)
12525 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12526 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12527 }
12528 }
12529
12530 // For f16 and bf16 require afn or arcp.
12531 // For f32 require afn.
12532 if (!AllowInaccurateRcp &&
12533 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12534 return SDValue();
12535
12536 // Turn into multiply by the reciprocal.
12537 // x / y -> x * (1.0 / y)
12538 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12539 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12540}
12541
12542SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12543 SelectionDAG &DAG) const {
12544 SDLoc SL(Op);
12545 SDValue X = Op.getOperand(0);
12546 SDValue Y = Op.getOperand(1);
12547 EVT VT = Op.getValueType();
12548 const SDNodeFlags Flags = Op->getFlags();
12549
12550 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12551 if (!AllowInaccurateDiv)
12552 return SDValue();
12553
12554 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12555 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12556
12557 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12558 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12559
12560 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12561 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12562 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12563 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12564 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12565 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12566}
12567
12568static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12569 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12570 SDNodeFlags Flags) {
12571 if (GlueChain->getNumValues() <= 1) {
12572 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12573 }
12574
12575 assert(GlueChain->getNumValues() == 3);
12576
12577 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12578 switch (Opcode) {
12579 default:
12580 llvm_unreachable("no chain equivalent for opcode");
12581 case ISD::FMUL:
12582 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12583 break;
12584 }
12585
12586 return DAG.getNode(Opcode, SL, VTList,
12587 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12588 Flags);
12589}
12590
12591static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12592 EVT VT, SDValue A, SDValue B, SDValue C,
12593 SDValue GlueChain, SDNodeFlags Flags) {
12594 if (GlueChain->getNumValues() <= 1) {
12595 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12596 }
12597
12598 assert(GlueChain->getNumValues() == 3);
12599
12600 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12601 switch (Opcode) {
12602 default:
12603 llvm_unreachable("no chain equivalent for opcode");
12604 case ISD::FMA:
12605 Opcode = AMDGPUISD::FMA_W_CHAIN;
12606 break;
12607 }
12608
12609 return DAG.getNode(Opcode, SL, VTList,
12610 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12611 Flags);
12612}
12613
// Lower 16-bit FDIV (f16/bf16) by computing in f32: bf16 uses a plain f32
// divide + round; f16 uses an rcp-based refinement sequence finished with
// v_div_fixup_f16.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // Promote both operands to f32.
  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);

  if (VT == MVT::bf16) {
    SDValue ExtDiv =
        DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
                       DAG.getTargetConstant(0, SL, MVT::i32));
  }

  assert(VT == MVT::f16);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Rcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
  SDValue Quot =
      DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                            Op->getFlags());
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                    Op->getFlags());
  // Keep only the sign/exponent bits of the correction term before adding it
  // back to the quotient (mask 0xff800000 is the f32 sign+exponent field).
  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
                        DAG.getConstant(0xff800000, SL, MVT::i32));
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
  // Round to f16 and fix up special cases (inf/nan/denorm) in f16 precision.
  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
                             DAG.getTargetConstant(0, SL, MVT::i32));
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
                     Op->getFlags());
}
12672
// Faster 2.5 ULP division that does not support denormals.
// Keeps the reciprocal input in range: if |rhs| exceeds 2^96 the denominator
// is pre-scaled by 2^-32 and the final quotient is multiplied by the same
// factor to compensate.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  // Operands 1 and 2: operand 0 of this node is the intrinsic ID.
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  // TODO: The combiner should probably handle elimination of redundant fabs.
  // NOTE(review): the declaration of r1 appears truncated in this listing;
  // from the uses below r1 is RHS or |RHS| — confirm against the original
  // source tree.
                   ? RHS
                   : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  // K0 = 2^96: magnitude threshold above which the denominator is scaled.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  // K1 = 2^-32: the scale factor applied when |rhs| > K0.
  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  // r2 = (|rhs| > 2^96)
  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  // r3 = 2^-32 when scaling is required, else 1.0 (a no-op factor).
  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  // Scale the denominator before taking the reciprocal.
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  // r3 * (lhs * rcp(rhs * r3)) == lhs / rhs.
  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
12709
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
// The encoded immediate carries the requested single-precision mode in the
// low bits and the function's existing DP denormal setting shifted into the
// next field, so only the SP behavior is overridden.
// NOTE(review): the first line of this function's signature is truncated in
// this listing (it takes the SP mode value and the SelectionDAG as leading
// parameters) — confirm against the original source tree.
                                 const SIMachineFunctionInfo *Info,
                                 const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  // Preserve the function's DP denormal setting; only SP is being changed.
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
12720
// Full-precision f32 fdiv expansion: scale numerator and denominator into a
// safe range with DIV_SCALE, refine a reciprocal estimate with an FMA
// sequence, and repair special cases with DIV_FIXUP. When the function does
// not already run with IEEE denormals, the FP32 denormal mode is temporarily
// enabled around the FMA sequence.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  // DIV_SCALE produces the scaled value plus an i1 "was scaled" flag; the
  // numerator's flag feeds DIV_FMAS at the end.
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
  SDValue NegDivScale0 =
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  // Hardware MODE register field holding the denormal control bits.
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      // The mode at runtime is unknown; read it first so it can be restored
      // after the FMA sequence.
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
    }

    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
                                 EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    // Merge the mode-switch chain/glue into the first FMA input so the FMA
    // sequence cannot be scheduled before denormals are enabled.
    SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
                      SDValue(EnableDenorm, 1)};

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  // Two Newton-Raphson refinements of the reciprocal estimate, then refine
  // the quotient estimate using the residual (num - den * q).
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
                           Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // Restore the previous denormal mode, ordered after the final FMA via
    // its chain and glue results.
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);

      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      DisableDenorm =
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                      Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
              .getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    // Anchor the restore on the DAG root so it is not dropped as dead.
    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  // DIV_FIXUP handles infinities, zeros and NaNs and undoes the scaling.
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
12857
12858SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12859 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12860 return FastLowered;
12861
12862 SDLoc SL(Op);
12863 SDValue X = Op.getOperand(0);
12864 SDValue Y = Op.getOperand(1);
12865
12866 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12867
12868 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12869
12870 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12871
12872 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12873
12874 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12875
12876 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12877
12878 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12879
12880 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12881
12882 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12883
12884 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12885 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12886
12887 SDValue Fma4 =
12888 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12889
12890 SDValue Scale;
12891
12892 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12893 // Workaround a hardware bug on SI where the condition output from div_scale
12894 // is not usable.
12895
12896 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12897
12898 // Figure out if the scale to use for div_fmas.
12899 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12900 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12901 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12902 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12903
12904 SDValue NumHi =
12905 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12906 SDValue DenHi =
12907 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12908
12909 SDValue Scale0Hi =
12910 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12911 SDValue Scale1Hi =
12912 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12913
12914 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12915 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12916 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12917 } else {
12918 Scale = DivScale1.getValue(1);
12919 }
12920
12921 SDValue Fmas =
12922 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12923
12924 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12925}
12926
12927SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12928 EVT VT = Op.getValueType();
12929
12930 if (VT == MVT::f32)
12931 return LowerFDIV32(Op, DAG);
12932
12933 if (VT == MVT::f64)
12934 return LowerFDIV64(Op, DAG);
12935
12936 if (VT == MVT::f16 || VT == MVT::bf16)
12937 return LowerFDIV16(Op, DAG);
12938
12939 llvm_unreachable("Unexpected type for fdiv");
12940}
12941
// Lower FFREXP: split Val into {mantissa, exponent} via the
// amdgcn_frexp_mant / amdgcn_frexp_exp intrinsics.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  EVT VT = Val.getValueType();
  // The exponent type requested by the caller may differ from the type the
  // instruction produces (i16 for f16 sources, i32 otherwise); convert at
  // the end.
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    // Affected subtargets mishandle non-finite inputs: substitute exponent 0
    // and pass the input through as the mantissa when Val is inf/nan.
    // NOTE(review): the initializer of Inf is truncated in this listing
    // (presumably the f32/f16 infinity constant) — confirm against the
    // original source tree.
    SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
    SDValue Inf =

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
    Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
    Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
  }

  // Widen/narrow the exponent to the requested result type.
  SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
  return DAG.getMergeValues({Mant, CastExp}, dl);
}
12971
// Custom store lowering: legalize i1 stores, and split / scalarize / expand
// vector stores the target cannot do directly for the given address space,
// element count and alignment.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Store i1 as a truncating i32 store.
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    // Split to sidestep the LDS misaligned-access bug in WGP mode.
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  // NOTE(review): the tail of this statement (selecting between the private
  // and global address spaces) is truncated in this listing — confirm
  // against the original source tree.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)

  unsigned NumElements = VT.getVectorNumElements();
    // At most four dwords per store instruction.
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

                                      VT, *Store->getMemOperand()))
      return expandUnalignedStore(Store, DAG);

    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private (scratch) stores are limited by the configured maximum
    // element size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
                                   Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    // Scalar but misaligned: fall back to the generic expansion.
    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
13052
13053// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13054SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13055 SDLoc SL(Op);
13056 assert(!Subtarget->has16BitInsts());
13057 SDNodeFlags Flags = Op->getFlags();
13058 SDValue Ext =
13059 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13060
13061 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13062 SDValue Sqrt =
13063 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13064
13065 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13066 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13067}
13068
// Lower f32 fsqrt. With approximate functions allowed this is a single
// hardware instruction; otherwise the estimate is corrected toward the
// correctly rounded result, with explicit scaling so denormal inputs stay
// in range.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  // Inputs below 2^-96 are pre-multiplied by 2^32 and the result is rescaled
  // by 2^-16 at the end, so the core sequence never sees a denormal.
  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    // Start from the hardware sqrt, then test the two neighboring
    // representable values and pick the candidate whose FMA residual
    // (x - s*s variants below) indicates the correct rounding.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    // Next value down: subtract one ulp by adding -1 to the bit pattern.
    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    SDValue SqrtSNextDownInt =
        DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                    DAG.getAllOnesConstant(DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    // Next value up: add one ulp to the bit pattern.
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    // Refine the rsq estimate with an FMA-based iteration.
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  // Undo the input scaling on the result.
  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  // sqrt of +/-0 or +inf returns the input unchanged.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
13158
13159SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13160 // For double type, the SQRT and RSQ instructions don't have required
13161 // precision, we apply Goldschmidt's algorithm to improve the result:
13162 //
13163 // y0 = rsq(x)
13164 // g0 = x * y0
13165 // h0 = 0.5 * y0
13166 //
13167 // r0 = 0.5 - h0 * g0
13168 // g1 = g0 * r0 + g0
13169 // h1 = h0 * r0 + h0
13170 //
13171 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13172 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13173 // h2 = h1 * r1 + h1
13174 //
13175 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13176 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13177 //
13178 // sqrt(x) = g3
13179
13180 SDNodeFlags Flags = Op->getFlags();
13181
13182 SDLoc DL(Op);
13183
13184 SDValue X = Op.getOperand(0);
13185 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
13186
13187 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
13188
13189 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
13190
13191 // Scale up input if it is too small.
13192 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
13193 SDValue ScaleUp =
13194 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
13195 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
13196
13197 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
13198
13199 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
13200
13201 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
13202 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
13203
13204 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
13205 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
13206
13207 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
13208
13209 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
13210
13211 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
13212 SDValue SqrtD0 =
13213 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
13214
13215 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
13216
13217 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
13218 SDValue SqrtD1 =
13219 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
13220
13221 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
13222
13223 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
13224 SDValue ScaleDown =
13225 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
13226 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
13227
13228 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
13229 // with finite only or nsz because rsq(+/-0) = +/-inf
13230
13231 // TODO: Check for DAZ and expand to subnormals
13232 SDValue IsZeroOrInf =
13233 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13234 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13235
13236 // If x is +INF, +0, or -0, use its original value
13237 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
13238 Flags);
13239}
13240
13241SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13242 SDLoc DL(Op);
13243 EVT VT = Op.getValueType();
13244 SDValue Arg = Op.getOperand(0);
13245 SDValue TrigVal;
13246
13247 // Propagate fast-math flags so that the multiply we introduce can be folded
13248 // if Arg is already the result of a multiply by constant.
13249 auto Flags = Op->getFlags();
13250
13251 // AMDGPUISD nodes of vector type must be unrolled here since
13252 // they will not be expanded elsewhere.
13253 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13254 if (!V.getValueType().isVector())
13255 return V;
13256
13257 return DAG.UnrollVectorOp(cast<SDNode>(V));
13258 };
13259
13260 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13261
13262 if (Subtarget->hasTrigReducedRange()) {
13263 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13264 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13265 } else {
13266 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13267 }
13268
13269 switch (Op.getOpcode()) {
13270 case ISD::FCOS:
13271 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13272 break;
13273 case ISD::FSIN:
13274 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13275 break;
13276 default:
13277 llvm_unreachable("Wrong trig opcode");
13278 }
13279
13280 return UnrollIfVec(TrigVal);
13281}
13282
// Custom lowering for compare-and-swap outside the local address space:
// pack {new, old} into a vector operand for the target ATOMIC_CMP_SWAP node.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  // NOTE(review): the condition guarding this early return (testing AS
  // against the local address space) is truncated in this listing — confirm
  // against the original source tree.
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  // Packed as {new, old} — element order matters to the consumer.
  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
13311
13312//===----------------------------------------------------------------------===//
13313// Custom DAG optimizations
13314//===----------------------------------------------------------------------===//
13315
13316SDValue
13317SITargetLowering::performUCharToFloatCombine(SDNode *N,
13318 DAGCombinerInfo &DCI) const {
13319 EVT VT = N->getValueType(0);
13320 EVT ScalarVT = VT.getScalarType();
13321 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13322 return SDValue();
13323
13324 SelectionDAG &DAG = DCI.DAG;
13325 SDLoc DL(N);
13326
13327 SDValue Src = N->getOperand(0);
13328 EVT SrcVT = Src.getValueType();
13329
13330 // TODO: We could try to match extracting the higher bytes, which would be
13331 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13332 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13333 // about in practice.
13334 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13335 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13336 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13337 DCI.AddToWorklist(Cvt.getNode());
13338
13339 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13340 if (ScalarVT != MVT::f32) {
13341 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13342 DAG.getTargetConstant(0, DL, MVT::i32));
13343 }
13344 return Cvt;
13345 }
13346 }
13347
13348 return SDValue();
13349}
13350
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    // View the f64 magnitude as pairs of f32s so only the sign-carrying high
    // half of each element goes through FCOPYSIGN.
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);

    // NOTE(review): the declaration of NewElts (a SmallVector of SDValue)
    // is truncated in this listing — confirm against the original source.
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I, DL, MVT::i32));
      SDValue MagHi =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I + 1, DL, MVT::i32));

      // A vector sign operand contributes the matching element; a scalar
      // sign applies to every element. (First branch truncated in listing.)
      SDValue SignOpElt =
          MagVT.isVector()
                        SignOp, DAG.getConstant(I, DL, MVT::i32))
              : SignOp;

      // Only the high f32 half carries the sign bit.
      SDValue HiOp =
          DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);

      SDValue Vector =
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

      SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
      NewElts.push_back(NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
                    DAG.getConstant(2 * I + 1, DL, MVT::i32));
    F32Signs.push_back(SignAsF32);
  }

  // Rebuild a narrower sign operand (vector branch truncated in listing).
  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
                EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
                F32Signs);

  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
                     NewSign);
}
13446
13447// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13448// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13449// bits
13450
13451// This is a variant of
13452// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13453//
13454// The normal DAG combiner will do this, but only if the add has one use since
13455// that would increase the number of instructions.
13456//
13457// This prevents us from seeing a constant offset that can be folded into a
13458// memory instruction's addressing mode. If we know the resulting add offset of
13459// a pointer can be folded into an addressing offset, we can replace the pointer
13460// operand with the add of new constant offset. This eliminates one of the uses,
13461// and may allow the remaining use to also be simplified.
13462//
13463SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13464 EVT MemVT,
13465 DAGCombinerInfo &DCI) const {
13466 SDValue N0 = N->getOperand(0);
13467 SDValue N1 = N->getOperand(1);
13468
13469 // We only do this to handle cases where it's profitable when there are
13470 // multiple uses of the add, so defer to the standard combine.
13471 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13472 return SDValue();
13473
13474 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13475 if (!CN1)
13476 return SDValue();
13477
13478 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13479 if (!CAdd)
13480 return SDValue();
13481
13482 SelectionDAG &DAG = DCI.DAG;
13483
13484 if (N0->getOpcode() == ISD::OR &&
13485 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13486 return SDValue();
13487
13488 // If the resulting offset is too large, we can't fold it into the
13489 // addressing mode offset.
13490 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13491 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13492
13493 AddrMode AM;
13494 AM.HasBaseReg = true;
13495 AM.BaseOffs = Offset.getSExtValue();
13496 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13497 return SDValue();
13498
13499 SDLoc SL(N);
13500 EVT VT = N->getValueType(0);
13501
13502 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13503 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13504
13505 SDNodeFlags Flags;
13506 Flags.setNoUnsignedWrap(
13507 N->getFlags().hasNoUnsignedWrap() &&
13508 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13509
13510 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13511 // be sure that the new left operand is a proper base pointer.
13512 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13513}
13514
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  case ISD::STORE:
    // Store-like nodes (additional case labels appear truncated in this
    // listing) carry the value before the pointer, so the pointer is
    // operand 2.
    return 2;
  default:
    // Everything else: operand 0 is the chain, operand 1 the pointer.
    return 1;
  }
}
13528
13529SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13530 DAGCombinerInfo &DCI) const {
13531 SelectionDAG &DAG = DCI.DAG;
13532
13533 unsigned PtrIdx = getBasePtrIndex(N);
13534 SDValue Ptr = N->getOperand(PtrIdx);
13535
13536 // TODO: We could also do this for multiplies.
13537 if (Ptr.getOpcode() == ISD::SHL) {
13538 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13539 N->getMemoryVT(), DCI);
13540 if (NewPtr) {
13541 SmallVector<SDValue, 8> NewOps(N->ops());
13542
13543 NewOps[PtrIdx] = NewPtr;
13544 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13545 }
13546 }
13547
13548 return SDValue();
13549}
13550
13551static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13552 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13553 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13554 (Opc == ISD::XOR && Val == 0);
13555}
13556
13557// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13558// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13559// integer combine opportunities since most 64-bit operations are decomposed
13560// this way. TODO: We won't want this for SALU especially if it is an inline
13561// immediate.
13562SDValue SITargetLowering::splitBinaryBitConstantOp(
13563 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13564 const ConstantSDNode *CRHS) const {
13565 uint64_t Val = CRHS->getZExtValue();
13566 uint32_t ValLo = Lo_32(Val);
13567 uint32_t ValHi = Hi_32(Val);
13568 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13569
13570 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13572 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13573 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13574 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13575 !CRHS->user_begin()->isDivergent())
13576 return SDValue();
13577
13578 // If we need to materialize a 64-bit immediate, it will be split up later
13579 // anyway. Avoid creating the harder to understand 64-bit immediate
13580 // materialization.
13581 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13582 }
13583
13584 return SDValue();
13585}
13586
13588 if (V.getValueType() != MVT::i1)
13589 return false;
13590 switch (V.getOpcode()) {
13591 default:
13592 break;
13593 case ISD::SETCC:
13594 case ISD::IS_FPCLASS:
13595 case AMDGPUISD::FP_CLASS:
13596 return true;
13597 case ISD::AND:
13598 case ISD::OR:
13599 case ISD::XOR:
13600 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13601 case ISD::SADDO:
13602 case ISD::UADDO:
13603 case ISD::SSUBO:
13604 case ISD::USUBO:
13605 case ISD::SMULO:
13606 case ISD::UMULO:
13607 return V.getResNo() == 1;
13609 unsigned IntrinsicID = V.getConstantOperandVal(0);
13610 switch (IntrinsicID) {
13611 case Intrinsic::amdgcn_is_shared:
13612 case Intrinsic::amdgcn_is_private:
13613 return true;
13614 default:
13615 return false;
13616 }
13617
13618 return false;
13619 }
13620 }
13621 return false;
13622}
13623
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  // Every byte that is not fully zero must be fully ones.
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
13642
13643// Check if a node selects whole bytes from its operand 0 starting at a byte
13644// boundary while masking the rest. Returns select mask as in the v_perm_b32
13645// or -1 if not succeeded.
13646// Note byte select encoding:
13647// value 0-3 selects corresponding source byte;
13648// value 0xc selects zero;
13649// value 0xff selects 0xff.
13651 assert(V.getValueSizeInBits() == 32);
13652
13653 if (V.getNumOperands() != 2)
13654 return ~0;
13655
13656 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13657 if (!N1)
13658 return ~0;
13659
13660 uint32_t C = N1->getZExtValue();
13661
13662 switch (V.getOpcode()) {
13663 default:
13664 break;
13665 case ISD::AND:
13666 if (uint32_t ConstMask = getConstantPermuteMask(C))
13667 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13668 break;
13669
13670 case ISD::OR:
13671 if (uint32_t ConstMask = getConstantPermuteMask(C))
13672 return (0x03020100 & ~ConstMask) | ConstMask;
13673 break;
13674
13675 case ISD::SHL:
13676 if (C % 8)
13677 return ~0;
13678
13679 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13680
13681 case ISD::SRL:
13682 if (C % 8)
13683 return ~0;
13684
13685 return uint32_t(0x0c0c0c0c03020100ull >> C);
13686 }
13687
13688 return ~0;
13689}
13690
13691SDValue SITargetLowering::performAndCombine(SDNode *N,
13692 DAGCombinerInfo &DCI) const {
13693 if (DCI.isBeforeLegalize())
13694 return SDValue();
13695
13696 SelectionDAG &DAG = DCI.DAG;
13697 EVT VT = N->getValueType(0);
13698 SDValue LHS = N->getOperand(0);
13699 SDValue RHS = N->getOperand(1);
13700
13701 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13702 if (VT == MVT::i64 && CRHS) {
13703 if (SDValue Split =
13704 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13705 return Split;
13706 }
13707
13708 if (CRHS && VT == MVT::i32) {
13709 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13710 // nb = number of trailing zeroes in mask
13711 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13712 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
13713 uint64_t Mask = CRHS->getZExtValue();
13714 unsigned Bits = llvm::popcount(Mask);
13715 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13716 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13717 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13718 unsigned Shift = CShift->getZExtValue();
13719 unsigned NB = CRHS->getAPIntValue().countr_zero();
13720 unsigned Offset = NB + Shift;
13721 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13722 SDLoc SL(N);
13723 SDValue BFE =
13724 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13725 DAG.getConstant(Offset, SL, MVT::i32),
13726 DAG.getConstant(Bits, SL, MVT::i32));
13727 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13728 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13729 DAG.getValueType(NarrowVT));
13730 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13731 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13732 return Shl;
13733 }
13734 }
13735 }
13736
13737 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13738 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13739 isa<ConstantSDNode>(LHS.getOperand(2))) {
13740 uint32_t Sel = getConstantPermuteMask(Mask);
13741 if (!Sel)
13742 return SDValue();
13743
13744 // Select 0xc for all zero bytes
13745 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13746 SDLoc DL(N);
13747 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13748 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13749 }
13750 }
13751
13752 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13753 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13754 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13755 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13756 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13757
13758 SDValue X = LHS.getOperand(0);
13759 SDValue Y = RHS.getOperand(0);
13760 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13761 !isTypeLegal(X.getValueType()))
13762 return SDValue();
13763
13764 if (LCC == ISD::SETO) {
13765 if (X != LHS.getOperand(1))
13766 return SDValue();
13767
13768 if (RCC == ISD::SETUNE) {
13769 const ConstantFPSDNode *C1 =
13770 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13771 if (!C1 || !C1->isInfinity() || C1->isNegative())
13772 return SDValue();
13773
13774 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13778
13779 static_assert(
13782 0x3ff) == Mask,
13783 "mask not equal");
13784
13785 SDLoc DL(N);
13786 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13787 DAG.getConstant(Mask, DL, MVT::i32));
13788 }
13789 }
13790 }
13791
13792 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13793 std::swap(LHS, RHS);
13794
13795 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13796 RHS.hasOneUse()) {
13797 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13798 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13799 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13800 // | n_nan)
13801 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13802 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13803 (RHS.getOperand(0) == LHS.getOperand(0) &&
13804 LHS.getOperand(0) == LHS.getOperand(1))) {
13805 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13806 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13807 : Mask->getZExtValue() & OrdMask;
13808
13809 SDLoc DL(N);
13810 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13811 DAG.getConstant(NewMask, DL, MVT::i32));
13812 }
13813 }
13814
13815 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13816 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13817 // and x, (sext cc from i1) => select cc, x, 0
13818 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13819 std::swap(LHS, RHS);
13820 if (isBoolSGPR(RHS.getOperand(0)))
13821 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13822 DAG.getConstant(0, SDLoc(N), MVT::i32));
13823 }
13824
13825 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13826 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13827 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13828 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13829 uint32_t LHSMask = getPermuteMask(LHS);
13830 uint32_t RHSMask = getPermuteMask(RHS);
13831 if (LHSMask != ~0u && RHSMask != ~0u) {
13832 // Canonicalize the expression in an attempt to have fewer unique masks
13833 // and therefore fewer registers used to hold the masks.
13834 if (LHSMask > RHSMask) {
13835 std::swap(LHSMask, RHSMask);
13836 std::swap(LHS, RHS);
13837 }
13838
13839 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13840 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13841 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13842 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13843
13844 // Check of we need to combine values from two sources within a byte.
13845 if (!(LHSUsedLanes & RHSUsedLanes) &&
13846 // If we select high and lower word keep it for SDWA.
13847 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13848 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13849 // Each byte in each mask is either selector mask 0-3, or has higher
13850 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13851 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13852 // mask which is not 0xff wins. By anding both masks we have a correct
13853 // result except that 0x0c shall be corrected to give 0x0c only.
13854 uint32_t Mask = LHSMask & RHSMask;
13855 for (unsigned I = 0; I < 32; I += 8) {
13856 uint32_t ByteSel = 0xff << I;
13857 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13858 Mask &= (0x0c << I) & 0xffffffff;
13859 }
13860
13861 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13862 // or 0x0c.
13863 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13864 SDLoc DL(N);
13865
13866 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13867 RHS.getOperand(0),
13868 DAG.getConstant(Sel, DL, MVT::i32));
13869 }
13870 }
13871 }
13872
13873 return SDValue();
13874}
13875
13876// A key component of v_perm is a mapping between byte position of the src
13877// operands, and the byte position of the dest. To provide such, we need: 1. the
13878// node that provides x byte of the dest of the OR, and 2. the byte of the node
13879// used to provide that x byte. calculateByteProvider finds which node provides
13880// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13881// and finds an ultimate src and byte position For example: The supported
13882// LoadCombine pattern for vector loads is as follows
13883// t1
13884// or
13885// / \
13886// t2 t3
13887// zext shl
13888// | | \
13889// t4 t5 16
13890// or anyext
13891// / \ |
13892// t6 t7 t8
13893// srl shl or
13894// / | / \ / \
13895// t9 t10 t11 t12 t13 t14
13896// trunc* 8 trunc* 8 and and
13897// | | / | | \
13898// t15 t16 t17 t18 t19 t20
13899// trunc* 255 srl -256
13900// | / \
13901// t15 t15 16
13902//
13903// *In this example, the truncs are from i32->i16
13904//
13905// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13906// respectively. calculateSrcByte would find (given node) -> ultimate src &
13907// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13908// After finding the mapping, we can combine the tree into vperm t15, t16,
13909// 0x05000407
13910
13911// Find the source and byte position from a node.
13912// \p DestByte is the byte position of the dest of the or that the src
13913// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13914// dest of the or byte. \p Depth tracks how many recursive iterations we have
13915// performed.
13916static const std::optional<ByteProvider<SDValue>>
13917calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13918 unsigned Depth = 0) {
13919 // We may need to recursively traverse a series of SRLs
13920 if (Depth >= 6)
13921 return std::nullopt;
13922
13923 if (Op.getValueSizeInBits() < 8)
13924 return std::nullopt;
13925
13926 if (Op.getValueType().isVector())
13927 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13928
13929 switch (Op->getOpcode()) {
13930 case ISD::TRUNCATE: {
13931 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13932 }
13933
13934 case ISD::ANY_EXTEND:
13935 case ISD::SIGN_EXTEND:
13936 case ISD::ZERO_EXTEND:
13938 SDValue NarrowOp = Op->getOperand(0);
13939 auto NarrowVT = NarrowOp.getValueType();
13940 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13941 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13942 NarrowVT = VTSign->getVT();
13943 }
13944 if (!NarrowVT.isByteSized())
13945 return std::nullopt;
13946 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13947
13948 if (SrcIndex >= NarrowByteWidth)
13949 return std::nullopt;
13950 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13951 }
13952
13953 case ISD::SRA:
13954 case ISD::SRL: {
13955 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13956 if (!ShiftOp)
13957 return std::nullopt;
13958
13959 uint64_t BitShift = ShiftOp->getZExtValue();
13960
13961 if (BitShift % 8 != 0)
13962 return std::nullopt;
13963
13964 SrcIndex += BitShift / 8;
13965
13966 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13967 }
13968
13969 default: {
13970 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13971 }
13972 }
13973 llvm_unreachable("fully handled switch");
13974}
13975
13976// For a byte position in the result of an Or, traverse the tree and find the
13977// node (and the byte of the node) which ultimately provides this {Or,
13978// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13979// the byte position of the Op that corresponds with the originally requested
13980// byte of the Or \p Depth tracks how many recursive iterations we have
13981// performed. \p StartingIndex is the originally requested byte of the Or
13982static const std::optional<ByteProvider<SDValue>>
13983calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13984 unsigned StartingIndex = 0) {
13985 // Finding Src tree of RHS of or typically requires at least 1 additional
13986 // depth
13987 if (Depth > 6)
13988 return std::nullopt;
13989
13990 unsigned BitWidth = Op.getScalarValueSizeInBits();
13991 if (BitWidth % 8 != 0)
13992 return std::nullopt;
13993 if (Index > BitWidth / 8 - 1)
13994 return std::nullopt;
13995
13996 bool IsVec = Op.getValueType().isVector();
13997 switch (Op.getOpcode()) {
13998 case ISD::OR: {
13999 if (IsVec)
14000 return std::nullopt;
14001
14002 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
14003 StartingIndex);
14004 if (!RHS)
14005 return std::nullopt;
14006 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14007 StartingIndex);
14008 if (!LHS)
14009 return std::nullopt;
14010 // A well formed Or will have two ByteProviders for each byte, one of which
14011 // is constant zero
14012 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14013 return std::nullopt;
14014 if (!LHS || LHS->isConstantZero())
14015 return RHS;
14016 if (!RHS || RHS->isConstantZero())
14017 return LHS;
14018 return std::nullopt;
14019 }
14020
14021 case ISD::AND: {
14022 if (IsVec)
14023 return std::nullopt;
14024
14025 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14026 if (!BitMaskOp)
14027 return std::nullopt;
14028
14029 uint32_t BitMask = BitMaskOp->getZExtValue();
14030 // Bits we expect for our StartingIndex
14031 uint32_t IndexMask = 0xFF << (Index * 8);
14032
14033 if ((IndexMask & BitMask) != IndexMask) {
14034 // If the result of the and partially provides the byte, then it
14035 // is not well formatted
14036 if (IndexMask & BitMask)
14037 return std::nullopt;
14039 }
14040
14041 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
14042 }
14043
14044 case ISD::FSHR: {
14045 if (IsVec)
14046 return std::nullopt;
14047
14048 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14049 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14050 if (!ShiftOp || Op.getValueType().isVector())
14051 return std::nullopt;
14052
14053 uint64_t BitsProvided = Op.getValueSizeInBits();
14054 if (BitsProvided % 8 != 0)
14055 return std::nullopt;
14056
14057 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14058 if (BitShift % 8)
14059 return std::nullopt;
14060
14061 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14062 uint64_t ByteShift = BitShift / 8;
14063
14064 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14065 uint64_t BytesProvided = BitsProvided / 8;
14066 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14067 NewIndex %= BytesProvided;
14068 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
14069 }
14070
14071 case ISD::SRA:
14072 case ISD::SRL: {
14073 if (IsVec)
14074 return std::nullopt;
14075
14076 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14077 if (!ShiftOp)
14078 return std::nullopt;
14079
14080 uint64_t BitShift = ShiftOp->getZExtValue();
14081 if (BitShift % 8)
14082 return std::nullopt;
14083
14084 auto BitsProvided = Op.getScalarValueSizeInBits();
14085 if (BitsProvided % 8 != 0)
14086 return std::nullopt;
14087
14088 uint64_t BytesProvided = BitsProvided / 8;
14089 uint64_t ByteShift = BitShift / 8;
14090 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14091 // If the byte we are trying to provide (as tracked by index) falls in this
14092 // range, then the SRL provides the byte. The byte of interest of the src of
14093 // the SRL is Index + ByteShift
14094 return BytesProvided - ByteShift > Index
14095 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
14096 Index + ByteShift)
14098 }
14099
14100 case ISD::SHL: {
14101 if (IsVec)
14102 return std::nullopt;
14103
14104 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14105 if (!ShiftOp)
14106 return std::nullopt;
14107
14108 uint64_t BitShift = ShiftOp->getZExtValue();
14109 if (BitShift % 8 != 0)
14110 return std::nullopt;
14111 uint64_t ByteShift = BitShift / 8;
14112
14113 // If we are shifting by an amount greater than (or equal to)
14114 // the index we are trying to provide, then it provides 0s. If not,
14115 // then this bytes are not definitively 0s, and the corresponding byte
14116 // of interest is Index - ByteShift of the src
14117 return Index < ByteShift
14119 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14120 Depth + 1, StartingIndex);
14121 }
14122 case ISD::ANY_EXTEND:
14123 case ISD::SIGN_EXTEND:
14124 case ISD::ZERO_EXTEND:
14126 case ISD::AssertZext:
14127 case ISD::AssertSext: {
14128 if (IsVec)
14129 return std::nullopt;
14130
14131 SDValue NarrowOp = Op->getOperand(0);
14132 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14133 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14134 Op->getOpcode() == ISD::AssertZext ||
14135 Op->getOpcode() == ISD::AssertSext) {
14136 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14137 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14138 }
14139 if (NarrowBitWidth % 8 != 0)
14140 return std::nullopt;
14141 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14142
14143 if (Index >= NarrowByteWidth)
14144 return Op.getOpcode() == ISD::ZERO_EXTEND
14145 ? std::optional<ByteProvider<SDValue>>(
14147 : std::nullopt;
14148 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14149 }
14150
14151 case ISD::TRUNCATE: {
14152 if (IsVec)
14153 return std::nullopt;
14154
14155 uint64_t NarrowByteWidth = BitWidth / 8;
14156
14157 if (NarrowByteWidth >= Index) {
14158 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14159 StartingIndex);
14160 }
14161
14162 return std::nullopt;
14163 }
14164
14165 case ISD::CopyFromReg: {
14166 if (BitWidth / 8 > Index)
14167 return calculateSrcByte(Op, StartingIndex, Index);
14168
14169 return std::nullopt;
14170 }
14171
14172 case ISD::LOAD: {
14173 auto *L = cast<LoadSDNode>(Op.getNode());
14174
14175 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14176 if (NarrowBitWidth % 8 != 0)
14177 return std::nullopt;
14178 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14179
14180 // If the width of the load does not reach byte we are trying to provide for
14181 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
14182 // question
14183 if (Index >= NarrowByteWidth) {
14184 return L->getExtensionType() == ISD::ZEXTLOAD
14185 ? std::optional<ByteProvider<SDValue>>(
14187 : std::nullopt;
14188 }
14189
14190 if (NarrowByteWidth > Index) {
14191 return calculateSrcByte(Op, StartingIndex, Index);
14192 }
14193
14194 return std::nullopt;
14195 }
14196
14197 case ISD::BSWAP: {
14198 if (IsVec)
14199 return std::nullopt;
14200
14201 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
14202 Depth + 1, StartingIndex);
14203 }
14204
14206 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14207 if (!IdxOp)
14208 return std::nullopt;
14209 auto VecIdx = IdxOp->getZExtValue();
14210 auto ScalarSize = Op.getScalarValueSizeInBits();
14211 if (ScalarSize < 32)
14212 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14213 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
14214 StartingIndex, Index);
14215 }
14216
14217 case AMDGPUISD::PERM: {
14218 if (IsVec)
14219 return std::nullopt;
14220
14221 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14222 if (!PermMask)
14223 return std::nullopt;
14224
14225 auto IdxMask =
14226 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14227 if (IdxMask > 0x07 && IdxMask != 0x0c)
14228 return std::nullopt;
14229
14230 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14231 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14232
14233 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14236 }
14237
14238 default: {
14239 return std::nullopt;
14240 }
14241 }
14242
14243 llvm_unreachable("fully handled switch");
14244}
14245
14246// Returns true if the Operand is a scalar and is 16 bits
14247static bool isExtendedFrom16Bits(SDValue &Operand) {
14248
14249 switch (Operand.getOpcode()) {
14250 case ISD::ANY_EXTEND:
14251 case ISD::SIGN_EXTEND:
14252 case ISD::ZERO_EXTEND: {
14253 auto OpVT = Operand.getOperand(0).getValueType();
14254 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14255 }
14256 case ISD::LOAD: {
14257 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14258 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
14259 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14260 ExtType == ISD::EXTLOAD) {
14261 auto MemVT = L->getMemoryVT();
14262 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14263 }
14264 return L->getMemoryVT().getSizeInBits() == 16;
14265 }
14266 default:
14267 return false;
14268 }
14269}
14270
// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a power of 2 byte offset from 0th byte
static bool addresses16Bits(int Mask) {
  int LoSel = Mask & 0xff;
  int HiSel = (Mask >> 8) & 0xff;

  assert(LoSel < 8 && HiSel < 8);
  // The two selected bytes must be adjacent, in increasing address order...
  if (HiSel - LoSel != 1)
    return false;
  // ...and the low byte must sit on an even (16-bit aligned) boundary.
  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we still need code to extract the 16 bit operand, so it
  // is better to use i8 v_perm
  return LoSel % 2 == 0;
}
14288
14289// Do not lower into v_perm if the operands are actually 16 bit
14290// and the selected bits (based on PermMask) correspond with two
14291// easily addressable 16 bit operands.
14293 SDValue &OtherOp) {
14294 int Low16 = PermMask & 0xffff;
14295 int Hi16 = (PermMask & 0xffff0000) >> 16;
14296
14297 auto TempOp = peekThroughBitcasts(Op);
14298 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14299
14300 auto OpIs16Bit =
14301 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14302 if (!OpIs16Bit)
14303 return true;
14304
14305 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14306 isExtendedFrom16Bits(TempOtherOp);
14307 if (!OtherOpIs16Bit)
14308 return true;
14309
14310 // Do we cleanly address both
14311 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14312}
14313
14315 unsigned DWordOffset) {
14316 SDValue Ret;
14317
14318 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14319 // ByteProvider must be at least 8 bits
14320 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14321
14322 if (TypeSize <= 32)
14323 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14324
14325 if (Src.getValueType().isVector()) {
14326 auto ScalarTySize = Src.getScalarValueSizeInBits();
14327 auto ScalarTy = Src.getValueType().getScalarType();
14328 if (ScalarTySize == 32) {
14329 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14330 DAG.getConstant(DWordOffset, SL, MVT::i32));
14331 }
14332 if (ScalarTySize > 32) {
14333 Ret = DAG.getNode(
14334 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14335 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14336 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14337 if (ShiftVal)
14338 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14339 DAG.getConstant(ShiftVal, SL, MVT::i32));
14340 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14341 }
14342
14343 assert(ScalarTySize < 32);
14344 auto NumElements = TypeSize / ScalarTySize;
14345 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14346 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14347 auto NumElementsIn32 = 32 / ScalarTySize;
14348 auto NumAvailElements = DWordOffset < Trunc32Elements
14349 ? NumElementsIn32
14350 : NumElements - NormalizedTrunc;
14351
14353 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14354 NumAvailElements);
14355
14356 Ret = DAG.getBuildVector(
14357 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14358 VecSrcs);
14359 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14360 }
14361
14362 /// Scalar Type
14363 auto ShiftVal = 32 * DWordOffset;
14364 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14365 DAG.getConstant(ShiftVal, SL, MVT::i32));
14366 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14367}
14368
14370 SelectionDAG &DAG = DCI.DAG;
14371 [[maybe_unused]] EVT VT = N->getValueType(0);
14373
14374 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14375 assert(VT == MVT::i32);
14376 for (int i = 0; i < 4; i++) {
14377 // Find the ByteProvider that provides the ith byte of the result of OR
14378 std::optional<ByteProvider<SDValue>> P =
14379 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14380 // TODO support constantZero
14381 if (!P || P->isConstantZero())
14382 return SDValue();
14383
14384 PermNodes.push_back(*P);
14385 }
14386 if (PermNodes.size() != 4)
14387 return SDValue();
14388
14389 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14390 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14391 uint64_t PermMask = 0x00000000;
14392 for (size_t i = 0; i < PermNodes.size(); i++) {
14393 auto PermOp = PermNodes[i];
14394 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14395 // by sizeof(Src2) = 4
14396 int SrcByteAdjust = 4;
14397
14398 // If the Src uses a byte from a different DWORD, then it corresponds
14399 // with a difference source
14400 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14401 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14402 if (SecondSrc)
14403 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14404 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14405 return SDValue();
14406
14407 // Set the index of the second distinct Src node
14408 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14409 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14410 SrcByteAdjust = 0;
14411 }
14412 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14414 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14415 }
14416 SDLoc DL(N);
14417 SDValue Op = *PermNodes[FirstSrc.first].Src;
14418 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14419 assert(Op.getValueSizeInBits() == 32);
14420
14421 // Check that we are not just extracting the bytes in order from an op
14422 if (!SecondSrc) {
14423 int Low16 = PermMask & 0xffff;
14424 int Hi16 = (PermMask & 0xffff0000) >> 16;
14425
14426 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14427 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14428
14429 // The perm op would really just produce Op. So combine into Op
14430 if (WellFormedLow && WellFormedHi)
14431 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14432 }
14433
14434 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14435
14436 if (SecondSrc) {
14437 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14438 assert(OtherOp.getValueSizeInBits() == 32);
14439 }
14440
14441 // Check that we haven't just recreated the same FSHR node.
14442 if (N->getOpcode() == ISD::FSHR &&
14443 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14444 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14445 return SDValue();
14446
14447 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14448
14449 assert(Op.getValueType().isByteSized() &&
14450 OtherOp.getValueType().isByteSized());
14451
14452 // If the ultimate src is less than 32 bits, then we will only be
14453 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14454 // CalculateByteProvider would not have returned Op as source if we
14455 // used a byte that is outside its ValueType. Thus, we are free to
14456 // ANY_EXTEND as the extended bits are dont-cares.
14457 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14458 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14459
14460 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14461 DAG.getConstant(PermMask, DL, MVT::i32));
14462 }
14463 return SDValue();
14464}
14465
14466SDValue SITargetLowering::performOrCombine(SDNode *N,
14467 DAGCombinerInfo &DCI) const {
14468 SelectionDAG &DAG = DCI.DAG;
14469 SDValue LHS = N->getOperand(0);
14470 SDValue RHS = N->getOperand(1);
14471
14472 EVT VT = N->getValueType(0);
14473 if (VT == MVT::i1) {
14474 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14475 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14476 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14477 SDValue Src = LHS.getOperand(0);
14478 if (Src != RHS.getOperand(0))
14479 return SDValue();
14480
14481 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14482 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14483 if (!CLHS || !CRHS)
14484 return SDValue();
14485
14486 // Only 10 bits are used.
14487 static const uint32_t MaxMask = 0x3ff;
14488
14489 uint32_t NewMask =
14490 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14491 SDLoc DL(N);
14492 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14493 DAG.getConstant(NewMask, DL, MVT::i32));
14494 }
14495
14496 return SDValue();
14497 }
14498
14499 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14501 LHS.getOpcode() == AMDGPUISD::PERM &&
14502 isa<ConstantSDNode>(LHS.getOperand(2))) {
14503 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14504 if (!Sel)
14505 return SDValue();
14506
14507 Sel |= LHS.getConstantOperandVal(2);
14508 SDLoc DL(N);
14509 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14510 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14511 }
14512
14513 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14514 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14515 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14516 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14517
14518 // If all the uses of an or need to extract the individual elements, do not
14519 // attempt to lower into v_perm
14520 auto usesCombinedOperand = [](SDNode *OrUse) {
14521 // If we have any non-vectorized use, then it is a candidate for v_perm
14522 if (OrUse->getOpcode() != ISD::BITCAST ||
14523 !OrUse->getValueType(0).isVector())
14524 return true;
14525
14526 // If we have any non-vectorized use, then it is a candidate for v_perm
14527 for (auto *VUser : OrUse->users()) {
14528 if (!VUser->getValueType(0).isVector())
14529 return true;
14530
14531 // If the use of a vector is a store, then combining via a v_perm
14532 // is beneficial.
14533 // TODO -- whitelist more uses
14534 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14535 if (VUser->getOpcode() == VectorwiseOp)
14536 return true;
14537 }
14538 return false;
14539 };
14540
14541 if (!any_of(N->users(), usesCombinedOperand))
14542 return SDValue();
14543
14544 uint32_t LHSMask = getPermuteMask(LHS);
14545 uint32_t RHSMask = getPermuteMask(RHS);
14546
14547 if (LHSMask != ~0u && RHSMask != ~0u) {
14548 // Canonicalize the expression in an attempt to have fewer unique masks
14549 // and therefore fewer registers used to hold the masks.
14550 if (LHSMask > RHSMask) {
14551 std::swap(LHSMask, RHSMask);
14552 std::swap(LHS, RHS);
14553 }
14554
14555 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14556 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14557 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14558 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14559
14560 // Check of we need to combine values from two sources within a byte.
14561 if (!(LHSUsedLanes & RHSUsedLanes) &&
14562 // If we select high and lower word keep it for SDWA.
14563 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14564 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14565 // Kill zero bytes selected by other mask. Zero value is 0xc.
14566 LHSMask &= ~RHSUsedLanes;
14567 RHSMask &= ~LHSUsedLanes;
14568 // Add 4 to each active LHS lane
14569 LHSMask |= LHSUsedLanes & 0x04040404;
14570 // Combine masks
14571 uint32_t Sel = LHSMask | RHSMask;
14572 SDLoc DL(N);
14573
14574 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14575 RHS.getOperand(0),
14576 DAG.getConstant(Sel, DL, MVT::i32));
14577 }
14578 }
14579 if (LHSMask == ~0u || RHSMask == ~0u) {
14580 if (SDValue Perm = matchPERM(N, DCI))
14581 return Perm;
14582 }
14583 }
14584
14585 // Detect identity v2i32 OR and replace with identity source node.
14586 // Specifically an Or that has operands constructed from the same source node
14587 // via extract_vector_elt and build_vector. I.E.
14588 // v2i32 or(
14589 // v2i32 build_vector(
14590 // i32 extract_elt(%IdentitySrc, 0),
14591 // i32 0
14592 // ),
14593 // v2i32 build_vector(
14594 // i32 0,
14595 // i32 extract_elt(%IdentitySrc, 1)
14596 // ) )
14597 // =>
14598 // v2i32 %IdentitySrc
14599
14600 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14601 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14602
14603 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14604 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14605
14606 // Test for and normalise build vectors.
14607 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14608
14609 // Get the extract_vector_element operands.
14610 SDValue LEVE = LHS->getOperand(0);
14611 SDValue REVE = RHS->getOperand(1);
14612
14613 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14615 // Check that different elements from the same vector are
14616 // extracted.
14617 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14618 LEVE->getOperand(1) != REVE->getOperand(1)) {
14619 SDValue IdentitySrc = LEVE.getOperand(0);
14620 return IdentitySrc;
14621 }
14622 }
14623 }
14624 }
14625
14626 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14627 return SDValue();
14628
14629 // TODO: This could be a generic combine with a predicate for extracting the
14630 // high half of an integer being free.
14631
14632 // (or i64:x, (zero_extend i32:y)) ->
14633 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14634 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14635 RHS.getOpcode() != ISD::ZERO_EXTEND)
14636 std::swap(LHS, RHS);
14637
14638 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14639 SDValue ExtSrc = RHS.getOperand(0);
14640 EVT SrcVT = ExtSrc.getValueType();
14641 if (SrcVT == MVT::i32) {
14642 SDLoc SL(N);
14643 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14644 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14645
14646 DCI.AddToWorklist(LowOr.getNode());
14647 DCI.AddToWorklist(HiBits.getNode());
14648
14649 SDValue Vec =
14650 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14651 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14652 }
14653 }
14654
14655 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14656 if (CRHS) {
14657 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14658 N->getOperand(0), CRHS))
14659 return Split;
14660 }
14661
14662 return SDValue();
14663}
14664
14665SDValue SITargetLowering::performXorCombine(SDNode *N,
14666 DAGCombinerInfo &DCI) const {
14667 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14668 return RV;
14669
14670 SDValue LHS = N->getOperand(0);
14671 SDValue RHS = N->getOperand(1);
14672
14673 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14674 SelectionDAG &DAG = DCI.DAG;
14675
14676 EVT VT = N->getValueType(0);
14677 if (CRHS && VT == MVT::i64) {
14678 if (SDValue Split =
14679 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14680 return Split;
14681 }
14682
14683 // v2i32 (xor (vselect cc, x, y), K) ->
14684 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14685 // replaced with source modifiers when the select is lowered to CNDMASK.
14686 unsigned Opc = LHS.getOpcode();
14687 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14688 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14689 CRHS && CRHS->getAPIntValue().isSignMask()) {
14690 SDValue CC = LHS->getOperand(0);
14691 SDValue TRUE = LHS->getOperand(1);
14692 SDValue FALSE = LHS->getOperand(2);
14693 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14694 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14695 SDValue XSelect =
14696 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14697 return XSelect;
14698 }
14699
14700 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14701 // fneg-like xors into 64-bit select.
14702 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14703 // This looks like an fneg, try to fold as a source modifier.
14704 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14706 // xor (select c, a, b), 0x80000000 ->
14707 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14708 SDLoc DL(N);
14709 SDValue CastLHS =
14710 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14711 SDValue CastRHS =
14712 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14713 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14714 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14715 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14716 LHS->getOperand(0), FNegLHS, FNegRHS);
14717 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14718 }
14719 }
14720
14721 return SDValue();
14722}
14723
14724SDValue
14725SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14726 DAGCombinerInfo &DCI) const {
14727 if (!Subtarget->has16BitInsts() ||
14728 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
14729 return SDValue();
14730
14731 EVT VT = N->getValueType(0);
14732 if (VT != MVT::i32)
14733 return SDValue();
14734
14735 SDValue Src = N->getOperand(0);
14736 if (Src.getValueType() != MVT::i16)
14737 return SDValue();
14738
14739 if (!Src->hasOneUse())
14740 return SDValue();
14741
14742 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
14743 // possible we're missing out on some combine opportunities, but we'd need to
14744 // weigh the cost of extracting the byte from the upper dwords.
14745
14746 std::optional<ByteProvider<SDValue>> BP0 =
14747 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
14748 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14749 return SDValue();
14750 SDValue V0 = *BP0->Src;
14751
14752 std::optional<ByteProvider<SDValue>> BP1 =
14753 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
14754 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14755 return SDValue();
14756
14757 SDValue V1 = *BP1->Src;
14758
14759 if (V0 == V1)
14760 return SDValue();
14761
14762 SelectionDAG &DAG = DCI.DAG;
14763 SDLoc DL(N);
14764 uint32_t PermMask = 0x0c0c0c0c;
14765 if (V0) {
14766 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
14767 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14768 }
14769
14770 if (V1) {
14771 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
14772 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14773 }
14774
14775 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
14776 DAG.getConstant(PermMask, DL, MVT::i32));
14777}
14778
/// Fold sign_extend_inreg of an unsigned subword buffer load into the
/// corresponding signed load variant.
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  // Operand 1 of SIGN_EXTEND_INREG carries the VT being extended from.
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    // The signed load produces a full i32 result; truncate back to the
    // original value type below.
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue Ops[] = {
        Src.getOperand(0), // source register
        Src.getOperand(1), // offset
        Src.getOperand(2)  // cachePolicy
    };
    auto *M = cast<MemSDNode>(Src);
    // Preserve the original memory VT and memory operand on the new node.
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    return LoadVal;
  }
  // Same rewrite for the VMEM (chained) unsigned subword loads; requires the
  // load to have a single use since the unsigned result is replaced.
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {Src.getOperand(0), // Chain
                     Src.getOperand(1), // rsrc
                     Src.getOperand(2), // vindex
                     Src.getOperand(3), // voffset
                     Src.getOperand(4), // soffset
                     Src.getOperand(5), // offset
                     Src.getOperand(6), Src.getOperand(7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    // Return both the value and the chain so the old node's users are covered.
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
14837
14838SDValue SITargetLowering::performClassCombine(SDNode *N,
14839 DAGCombinerInfo &DCI) const {
14840 SelectionDAG &DAG = DCI.DAG;
14841 SDValue Mask = N->getOperand(1);
14842
14843 // fp_class x, 0 -> false
14844 if (isNullConstant(Mask))
14845 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14846
14847 if (N->getOperand(0).isUndef())
14848 return DAG.getUNDEF(MVT::i1);
14849
14850 return SDValue();
14851}
14852
14853SDValue SITargetLowering::performRcpCombine(SDNode *N,
14854 DAGCombinerInfo &DCI) const {
14855 EVT VT = N->getValueType(0);
14856 SDValue N0 = N->getOperand(0);
14857
14858 if (N0.isUndef()) {
14859 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14860 SDLoc(N), VT);
14861 }
14862
14863 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14864 N0.getOpcode() == ISD::SINT_TO_FP)) {
14865 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14866 N->getFlags());
14867 }
14868
14869 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14870 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14871 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14872 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14873 N->getFlags());
14874 }
14875
14877}
14878
                                       SDNodeFlags UserFlags,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  // An explicit canonicalize is canonical by definition.
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  // FP constants: signaling NaNs are never canonical; denormal constants are
  // canonical only when the function runs with full IEEE denormal support.
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
  case ISD::FLDEXP:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::AND:
    if (Op.getValueType() == MVT::i32) {
      // Be careful as we only know it is a bitcast floating point type. It
      // could be f32, v2f16, we have no way of knowing. Luckily the constant
      // value that we optimize for, which comes up in fp32 to bf16 conversions,
      // is valid to optimize for all types.
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
        }
      }
    }
    break;

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    // NOTE(review): f16 is excluded here — presumably its lowering does not
    // canonicalize; confirm against the sin/cos lowering.
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations different based these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum..

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(DAG, Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    // A select is canonical iff both of its value operands are.
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    // Canonical iff every element is canonical.
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
    // NOTE(review): the case label(s) for this block are missing from this
    // chunk (likely extract_vector_elt/extract_subvector) — verify upstream.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
    // NOTE(review): the case label for this two-operand block is missing from
    // this chunk (likely insert_vector_elt) — verify upstream.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST:
    // TODO: This is incorrect as it loses track of the operand's type. We may
    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
    // same bits that are canonicalized in one type need not be in the other.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  case ISD::TRUNCATE: {
    // Hack round the mess we make when legalizing extract_vector_elt
    if (Op.getValueType() == MVT::i16) {
      SDValue TruncSrc = Op.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }
    return false;
  }
    // NOTE(review): the case label opening this block is missing from this
    // chunk (likely ISD::INTRINSIC_WO_CHAIN) — verify upstream.
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  // FIXME: denormalsEnabledForType is broken for dynamic
  return denormalsEnabledForType(DAG, Op.getValueType()) &&
         (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
}
15082
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  // An explicit canonicalize is canonical by definition.
  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  // NOTE(review): the guard that populates FCR (an mi_match on a constant or
  // constant splat) appears to be missing from this chunk — verify upstream.
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    // Denormal constants are canonical only under full IEEE denormal mode.
    DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // Recursion bound reached; conservatively report non-canonical.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These operations produce a canonical (flushed/quieted) result.
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // Bit operations: canonical iff their source is canonical.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    // Min/max only guarantee a canonical result when the target supports
    // denormal modes for them or denormals are enabled for the type;
    // otherwise fall through and check the operands recursively.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(MRI.getType(Reg), MF))
      return true;

    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
      if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    // Intrinsics known to produce canonical results.
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    return false;
  }

  llvm_unreachable("invalid operation");
}
15201
15202// Constant fold canonicalize.
15203SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15204 const SDLoc &SL, EVT VT,
15205 const APFloat &C) const {
15206 // Flush denormals to 0 if not enabled.
15207 if (C.isDenormal()) {
15208 DenormalMode Mode =
15209 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15210 if (Mode == DenormalMode::getPreserveSign()) {
15211 return DAG.getConstantFP(
15212 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15213 }
15214
15215 if (Mode != DenormalMode::getIEEE())
15216 return SDValue();
15217 }
15218
15219 if (C.isNaN()) {
15220 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15221 if (C.isSignaling()) {
15222 // Quiet a signaling NaN.
15223 // FIXME: Is this supposed to preserve payload bits?
15224 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15225 }
15226
15227 // Make sure it is the canonical NaN bitpattern.
15228 //
15229 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15230 // immediate?
15231 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15232 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15233 }
15234
15235 // Already canonical.
15236 return DAG.getConstantFP(C, SL, VT);
15237}
15238
  // An undef lane or an FP constant element will be folded by later combines,
  // so it is "free" as a build_vector element.
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
15242
15243SDValue
15244SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15245 DAGCombinerInfo &DCI) const {
15246 SelectionDAG &DAG = DCI.DAG;
15247 SDValue N0 = N->getOperand(0);
15248 EVT VT = N->getValueType(0);
15249
15250 // fcanonicalize undef -> qnan
15251 if (N0.isUndef()) {
15253 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
15254 }
15255
15256 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
15257 EVT VT = N->getValueType(0);
15258 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15259 }
15260
15261 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15262 // (fcanonicalize k)
15263 //
15264 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15265
15266 // TODO: This could be better with wider vectors that will be split to v2f16,
15267 // and to consider uses since there aren't that many packed operations.
15268 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15269 isTypeLegal(MVT::v2f16)) {
15270 SDLoc SL(N);
15271 SDValue NewElts[2];
15272 SDValue Lo = N0.getOperand(0);
15273 SDValue Hi = N0.getOperand(1);
15274 EVT EltVT = Lo.getValueType();
15275
15277 for (unsigned I = 0; I != 2; ++I) {
15278 SDValue Op = N0.getOperand(I);
15279 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15280 NewElts[I] =
15281 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15282 } else if (Op.isUndef()) {
15283 // Handled below based on what the other operand is.
15284 NewElts[I] = Op;
15285 } else {
15286 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
15287 }
15288 }
15289
15290 // If one half is undef, and one is constant, prefer a splat vector rather
15291 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15292 // cheaper to use and may be free with a packed operation.
15293 if (NewElts[0].isUndef()) {
15294 if (isa<ConstantFPSDNode>(NewElts[1]))
15295 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
15296 ? NewElts[1]
15297 : DAG.getConstantFP(0.0f, SL, EltVT);
15298 }
15299
15300 if (NewElts[1].isUndef()) {
15301 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
15302 ? NewElts[0]
15303 : DAG.getConstantFP(0.0f, SL, EltVT);
15304 }
15305
15306 return DAG.getBuildVector(VT, SL, NewElts);
15307 }
15308 }
15309
15310 return SDValue();
15311}
15312
15313static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15314 switch (Opc) {
15315 case ISD::FMAXNUM:
15316 case ISD::FMAXNUM_IEEE:
15317 case ISD::FMAXIMUMNUM:
15318 return AMDGPUISD::FMAX3;
15319 case ISD::FMAXIMUM:
15320 return AMDGPUISD::FMAXIMUM3;
15321 case ISD::SMAX:
15322 return AMDGPUISD::SMAX3;
15323 case ISD::UMAX:
15324 return AMDGPUISD::UMAX3;
15325 case ISD::FMINNUM:
15326 case ISD::FMINNUM_IEEE:
15327 case ISD::FMINIMUMNUM:
15328 return AMDGPUISD::FMIN3;
15329 case ISD::FMINIMUM:
15330 return AMDGPUISD::FMINIMUM3;
15331 case ISD::SMIN:
15332 return AMDGPUISD::SMIN3;
15333 case ISD::UMIN:
15334 return AMDGPUISD::UMIN3;
15335 default:
15336 llvm_unreachable("Not a min/max opcode");
15337 }
15338}
15339
15340SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15341 const SDLoc &SL, SDValue Src,
15342 SDValue MinVal,
15343 SDValue MaxVal,
15344 bool Signed) const {
15345
15346 // med3 comes from
15347 // min(max(x, K0), K1), K0 < K1
15348 // max(min(x, K0), K1), K1 < K0
15349 //
15350 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15351 // min/max op.
15352 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15353 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15354
15355 if (!MinK || !MaxK)
15356 return SDValue();
15357
15358 if (Signed) {
15359 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15360 return SDValue();
15361 } else {
15362 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15363 return SDValue();
15364 }
15365
15366 EVT VT = MinK->getValueType(0);
15367 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15368 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15369 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15370
15371 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15372 // not available, but this is unlikely to be profitable as constants
15373 // will often need to be materialized & extended, especially on
15374 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15375 return SDValue();
15376}
15377
// getSplatConstantFP (visible tail): yields the FP constant behind either a
// scalar ConstantFPSDNode or a constant-splat BUILD_VECTOR, else nullptr.
// NOTE(review): the signature and the scalar-constant dyn_cast sit on source
// lines elided from this excerpt (15378-15379, 15382) — confirm upstream.
15380 return C;
15381
// BUILD_VECTOR path: accept only when every lane is the same FP constant.
15383 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15384 return C;
15385 }
15386
// Neither a scalar FP constant nor a constant splat.
15387 return nullptr;
15388}
15389
15390SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15391 const SDLoc &SL, SDValue Op0,
15392 SDValue Op1) const {
15393 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
15394 if (!K1)
15395 return SDValue();
15396
15397 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15398 if (!K0)
15399 return SDValue();
15400
15401 // Ordered >= (although NaN inputs should have folded away by now).
15402 if (K0->getValueAPF() > K1->getValueAPF())
15403 return SDValue();
15404
15405 // med3 with a nan input acts like
15406 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15407 //
15408 // So the result depends on whether the IEEE mode bit is enabled or not with a
15409 // signaling nan input.
15410 // ieee=1
15411 // s0 snan: yields s2
15412 // s1 snan: yields s2
15413 // s2 snan: qnan
15414
15415 // s0 qnan: min(s1, s2)
15416 // s1 qnan: min(s0, s2)
15417 // s2 qnan: min(s0, s1)
15418
15419 // ieee=0
15420 // s0 snan: min(s1, s2)
15421 // s1 snan: min(s0, s2)
15422 // s2 snan: qnan
15423
15424 // s0 qnan: min(s1, s2)
15425 // s1 qnan: min(s0, s2)
15426 // s2 qnan: min(s0, s1)
15427 const MachineFunction &MF = DAG.getMachineFunction();
15428 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15429
15430 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15431 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15432 // can only form if op0 is fmaxnum_ieee if IEEE=1.
15433 EVT VT = Op0.getValueType();
15434 if (Info->getMode().DX10Clamp) {
15435 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15436 // hardware fmed3 behavior converting to a min.
15437 // FIXME: Should this be allowing -0.0?
15438 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15439 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15440 }
15441
15442 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15443 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15444 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15445 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15446 // then give the other result, which is different from med3 with a NaN
15447 // input.
15448 SDValue Var = Op0.getOperand(0);
15449 if (!DAG.isKnownNeverSNaN(Var))
15450 return SDValue();
15451
15452 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15453
15454 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15455 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15456 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15457 SDValue(K0, 0), SDValue(K1, 0));
15458 }
15459 }
15460
15461 return SDValue();
15462}
15463
15464/// \return true if the subtarget supports minimum3 and maximum3 with the given
15465/// base min/max opcode \p Opc for type \p VT.
15466static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15467 EVT VT) {
15468 switch (Opc) {
15469 case ISD::FMINNUM:
15470 case ISD::FMAXNUM:
15471 case ISD::FMINNUM_IEEE:
15472 case ISD::FMAXNUM_IEEE:
15473 case ISD::FMINIMUMNUM:
15474 case ISD::FMAXIMUMNUM:
15475 case AMDGPUISD::FMIN_LEGACY:
15476 case AMDGPUISD::FMAX_LEGACY:
15477 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15478 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15479 case ISD::FMINIMUM:
15480 case ISD::FMAXIMUM:
15481 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15482 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15483 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15484 case ISD::SMAX:
15485 case ISD::SMIN:
15486 case ISD::UMAX:
15487 case ISD::UMIN:
15488 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15489 default:
15490 return false;
15491 }
15492
15493 llvm_unreachable("not a min/max opcode");
15494}
15495
// Combine two-operand min/max into wider forms:
//  * nested min/min or max/max (either operand order) -> min3/max3 when the
//    subtarget supports it;
//  * integer min-of-max / max-of-min with constant bounds -> s/umed3;
//  * FP min-of-max with splat constant bounds -> fmed3 (or clamp);
//  * fminimum/fmaximum with nnan -> a cheaper min/max flavor.
// NOTE(review): source lines 15557-15558, 15577 and 15579 are elided from
// this excerpt; they complete the FP opcode list and the final rewrite's
// condition/opcode selection — confirm against upstream before editing.
15496SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15497 DAGCombinerInfo &DCI) const {
15498 SelectionDAG &DAG = DCI.DAG;
15499
15500 EVT VT = N->getValueType(0);
15501 unsigned Opc = N->getOpcode();
15502 SDValue Op0 = N->getOperand(0);
15503 SDValue Op1 = N->getOperand(1);
15504
15505 // Only do this if the inner op has one use since this will just increases
15506 // register pressure for no benefit.
15507
15508 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15509 // max(max(a, b), c) -> max3(a, b, c)
15510 // min(min(a, b), c) -> min3(a, b, c)
15511 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15512 SDLoc DL(N);
15513 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15514 Op0.getOperand(0), Op0.getOperand(1), Op1);
15515 }
15516
15517 // Try commuted.
15518 // max(a, max(b, c)) -> max3(a, b, c)
15519 // min(a, min(b, c)) -> min3(a, b, c)
15520 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15521 SDLoc DL(N);
15522 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15523 Op0, Op1.getOperand(0), Op1.getOperand(1));
15524 }
15525 }
15526
15527 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15528 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
// Signed integer med3 (both min-of-max and max-of-min shapes).
15529 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15530 if (SDValue Med3 = performIntMed3ImmCombine(
15531 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15532 return Med3;
15533 }
15534 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15535 if (SDValue Med3 = performIntMed3ImmCombine(
15536 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15537 return Med3;
15538 }
15539
// Unsigned integer med3, same two shapes.
15540 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15541 if (SDValue Med3 = performIntMed3ImmCombine(
15542 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15543 return Med3;
15544 }
15545 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15546 if (SDValue Med3 = performIntMed3ImmCombine(
15547 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15548 return Med3;
15549 }
15550
15551 // if !is_snan(x):
15552 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15553 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15554 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15555 // fmin_legacy(fmax_legacy(x, K0), K1), K1 < K0 -> fmed3(x, K0, K1)
// NOTE(review): the elided lines 15557-15558 presumably cover the
// fminnum_ieee/fminimumnum opcode pairs listed in the comment above.
15556 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15559 (Opc == AMDGPUISD::FMIN_LEGACY &&
15560 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15561 (VT == MVT::f32 || VT == MVT::f64 ||
15562 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15563 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15564 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15565 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15566 Op0.hasOneUse()) {
15567 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15568 return Res;
15569 }
15570
15571 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15572 // for some types, but at a higher cost since it's implemented with a 3
15573 // operand form.
15574 const SDNodeFlags Flags = N->getFlags();
15575 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
15576 !Subtarget->hasIEEEMinimumMaximumInsts() &&
// NOTE(review): elided lines 15577/15579 carry the rest of this condition
// and the NewOpc initializer — this rewrite is only valid with them present.
15578 unsigned NewOpc =
15580 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15581 }
15582
15583 return SDValue();
15584}
15585
// isClampZeroToOne (visible tail): true when the two FP constants are exactly
// {0.0, 1.0} in either order, i.e. the pair describes a clamp to [0, 1].
// NOTE(review): the signature and the dyn_casts producing CA/CB are on lines
// elided from this excerpt (15586-15588) — confirm upstream.
15589 // FIXME: Should this be allowing -0.0?
15590 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15591 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15592 }
15593 }
15594
// Not a pair of FP constants.
15595 return false;
15596}
15597
15598// FIXME: Should only worry about snans for version with chain.
15599SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15600 DAGCombinerInfo &DCI) const {
15601 EVT VT = N->getValueType(0);
15602 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15603 // NaNs. With a NaN input, the order of the operands may change the result.
15604
15605 SelectionDAG &DAG = DCI.DAG;
15606 SDLoc SL(N);
15607
15608 SDValue Src0 = N->getOperand(0);
15609 SDValue Src1 = N->getOperand(1);
15610 SDValue Src2 = N->getOperand(2);
15611
15612 if (isClampZeroToOne(Src0, Src1)) {
15613 // const_a, const_b, x -> clamp is safe in all cases including signaling
15614 // nans.
15615 // FIXME: Should this be allowing -0.0?
15616 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15617 }
15618
15619 const MachineFunction &MF = DAG.getMachineFunction();
15620 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15621
15622 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15623 // handling no dx10-clamp?
15624 if (Info->getMode().DX10Clamp) {
15625 // If NaNs is clamped to 0, we are free to reorder the inputs.
15626
15627 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15628 std::swap(Src0, Src1);
15629
15630 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15631 std::swap(Src1, Src2);
15632
15633 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15634 std::swap(Src0, Src1);
15635
15636 if (isClampZeroToOne(Src1, Src2))
15637 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15638 }
15639
15640 return SDValue();
15641}
15642
15643SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15644 DAGCombinerInfo &DCI) const {
15645 SDValue Src0 = N->getOperand(0);
15646 SDValue Src1 = N->getOperand(1);
15647 if (Src0.isUndef() && Src1.isUndef())
15648 return DCI.DAG.getUNDEF(N->getValueType(0));
15649 return SDValue();
15650}
15651
// NOTE(review): the signature opening (line 15654) and an early guard at line
// 15658 (presumably the amdgpu-use-divergent-register-indexing opt-out) are
// elided from this excerpt — confirm upstream before editing.
15652// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15653// expanded into a set of cmp/select instructions.
15655 unsigned NumElem,
15656 bool IsDivergentIdx,
15657 const GCNSubtarget *Subtarget) {
15659 return false;
15660
15661 unsigned VecSize = EltSize * NumElem;
15662
15663 // Sub-dword vectors of size 2 dword or less have better implementation.
15664 if (VecSize <= 64 && EltSize < 32)
15665 return false;
15666
15667 // Always expand the rest of sub-dword instructions, otherwise it will be
15668 // lowered via memory.
15669 if (EltSize < 32)
15670 return true;
15671
15672 // Always do this if var-idx is divergent, otherwise it will become a loop.
15673 if (IsDivergentIdx)
15674 return true;
15675
15676 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15677 unsigned NumInsts = NumElem /* Number of compares */ +
15678 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15679
15680 // On some architectures (GFX9) movrel is not available and it's better
15681 // to expand.
15682 if (Subtarget->useVGPRIndexMode())
15683 return NumInsts <= 16;
15684
15685 // If movrel is available, use it instead of expanding for vector of 8
15686 // elements.
15687 if (Subtarget->hasMovrel())
15688 return NumInsts <= 15;
15689
// No movrel and no VGPR index mode: always expand.
15690 return true;
15691}
15692
// SDNode overload: pull element size/count and index divergence out of an
// EXTRACT/INSERT_VECTOR_ELT node, then defer to the scalar-parameter overload
// above. A constant index never needs the cmp/select expansion.
// NOTE(review): the signature (line 15693) and the forwarding call line
// (15704) are elided from this excerpt — confirm upstream.
15694 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15695 if (isa<ConstantSDNode>(Idx))
15696 return false;
15697
15698 SDValue Vec = N->getOperand(0);
15699 EVT VecVT = Vec.getValueType();
15700 EVT EltVT = VecVT.getVectorElementType();
15701 unsigned EltSize = EltVT.getSizeInBits();
15702 unsigned NumElem = VecVT.getVectorNumElements();
15703
15705 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15706}
15707
// Combines on EXTRACT_VECTOR_ELT: hoist fneg/fabs out of the vector, peel a
// v2i32 AND mask, scalarize single-use vector binops, expand variable-index
// extracts into select chains, and widen sub-dword extracts of loaded vectors
// to 32-bit extracts.
// NOTE(review): lines 15722, 15739 and 15792 are elided from this excerpt
// (extra condition on the fneg/fabs fold, the ConstantSDNode lookup for the
// AND mask, and the shouldExpandVectorDynExt guard) — confirm upstream.
15708SDValue
15709SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15710 DAGCombinerInfo &DCI) const {
15711 SDValue Vec = N->getOperand(0);
15712 SelectionDAG &DAG = DCI.DAG;
15713
15714 EVT VecVT = Vec.getValueType();
15715 EVT VecEltVT = VecVT.getVectorElementType();
15716 EVT ResVT = N->getValueType(0);
15717
15718 unsigned VecSize = VecVT.getSizeInBits();
15719 unsigned VecEltSize = VecEltVT.getSizeInBits();
15720
// Move a unary fneg/fabs from the whole vector onto the extracted scalar.
15721 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15723 SDLoc SL(N);
15724 SDValue Idx = N->getOperand(1);
15725 SDValue Elt =
15726 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15727 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15728 }
15729
15730 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15731 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15732 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15733 // depending on the shift operand. See e.g. performSraCombine().
15734 // This combine ensures that the optimisation is compatible with v2i32
15735 // legalised AND.
15736 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15737 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15738
// NOTE(review): this early `return SDValue()` exits the whole combine when
// the mask is not 0x1f, skipping every combine below — verify this is the
// intended behavior rather than a plain fall-through.
15740 if (!C || C->getZExtValue() != 0x1f)
15741 return SDValue();
15742
15743 SDLoc SL(N);
15744 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15745 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15746 Vec->getOperand(0), N->getOperand(1));
15747 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
// The node is RAUW'd in place here and control falls through to the
// remaining combines rather than returning the replacement.
15748 DAG.ReplaceAllUsesWith(N, A.getNode());
15749 }
15750
15751 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15752 // =>
15753 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15754 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15755 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15756 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15757 SDLoc SL(N);
15758 SDValue Idx = N->getOperand(1);
15759 unsigned Opc = Vec.getOpcode();
15760
15761 switch (Opc) {
15762 default:
15763 break;
15764 // TODO: Support other binary operations.
15765 case ISD::FADD:
15766 case ISD::FSUB:
15767 case ISD::FMUL:
15768 case ISD::ADD:
15769 case ISD::UMIN:
15770 case ISD::UMAX:
15771 case ISD::SMIN:
15772 case ISD::SMAX:
15773 case ISD::FMAXNUM:
15774 case ISD::FMINNUM:
15775 case ISD::FMAXNUM_IEEE:
15776 case ISD::FMINNUM_IEEE:
15777 case ISD::FMAXIMUM:
15778 case ISD::FMINIMUM: {
15779 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15780 Vec.getOperand(0), Idx);
15781 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15782 Vec.getOperand(1), Idx);
15783
15784 DCI.AddToWorklist(Elt0.getNode());
15785 DCI.AddToWorklist(Elt1.getNode());
// The scalar binop inherits the vector node's fast-math/wrap flags.
15786 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15787 }
15788 }
15789 }
15790
15791 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
// NOTE(review): the guard (line 15792, presumably shouldExpandVectorDynExt)
// is elided from this excerpt.
15793 SDLoc SL(N);
15794 SDValue Idx = N->getOperand(1);
15795 SDValue V;
// Build a chain of selects comparing the variable index against each
// constant lane index; lane 0 is the default.
15796 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15797 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15798 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15799 if (I == 0)
15800 V = Elt;
15801 else
15802 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15803 }
15804 return V;
15805 }
15806
15807 if (!DCI.isBeforeLegalize())
15808 return SDValue();
15809
15810 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15811 // elements. This exposes more load reduction opportunities by replacing
15812 // multiple small extract_vector_elements with a single 32-bit extract.
15813 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15814 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15815 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15816 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15817
// Locate which 32-bit word holds the element and the bit offset within it.
15818 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15819 unsigned EltIdx = BitIndex / 32;
15820 unsigned LeftoverBitIdx = BitIndex % 32;
15821 SDLoc SL(N);
15822
15823 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15824 DCI.AddToWorklist(Cast.getNode());
15825
15826 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15827 DAG.getConstant(EltIdx, SL, MVT::i32));
15828 DCI.AddToWorklist(Elt.getNode());
15829 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15830 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15831 DCI.AddToWorklist(Srl.getNode());
15832
15833 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15834 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15835 DCI.AddToWorklist(Trunc.getNode());
15836
15837 if (VecEltVT == ResVT) {
15838 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15839 }
15840
// Mismatched result type can only be an integer extension/truncation.
15841 assert(ResVT.isScalarInteger());
15842 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15843 }
15844
15845 return SDValue();
15846}
15847
// Expand INSERT_VECTOR_ELT with a variable index into a BUILD_VECTOR of
// per-lane selects: lane I becomes (Idx == I ? Ins : old lane I).
// NOTE(review): lines 15858 (the shouldExpandVectorDynExt guard condition)
// and 15866 (the Ops vector declaration) are elided from this excerpt.
15848SDValue
15849SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15850 DAGCombinerInfo &DCI) const {
15851 SDValue Vec = N->getOperand(0);
15852 SDValue Idx = N->getOperand(2);
15853 EVT VecVT = Vec.getValueType();
15854 EVT EltVT = VecVT.getVectorElementType();
15855
15856 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15857 // => BUILD_VECTOR n x select (e, const-idx)
15859 return SDValue();
15860
15861 SelectionDAG &DAG = DCI.DAG;
15862 SDLoc SL(N);
15863 SDValue Ins = N->getOperand(1);
15864 EVT IdxVT = Idx.getValueType();
15865
// For each lane, select the inserted value when the dynamic index matches
// that lane, otherwise keep the original element.
15867 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15868 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15869 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15870 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15871 Ops.push_back(V);
15872 }
15873
15874 return DAG.getBuildVector(VecVT, SL, Ops);
15875}
15876
15877/// Return the source of an fp_extend from f16 to f32, or a converted FP
15878/// constant.
// NOTE(review): the signature (line 15879) and the APFloat conversion call
// (line 15888, presumably a convert to IEEEhalf writing LosesInfo) are
// elided from this excerpt — confirm upstream.
15880 if (Src.getOpcode() == ISD::FP_EXTEND &&
15881 Src.getOperand(0).getValueType() == MVT::f16) {
15882 return Src.getOperand(0);
15883 }
15884
// FP constant: accept only when it can be represented exactly in f16.
15885 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15886 APFloat Val = CFP->getValueAPF();
15887 bool LosesInfo = true;
15889 if (!LosesInfo)
15890 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15891 }
15892
// Neither an fpext-from-f16 nor a losslessly convertible constant.
15893 return SDValue();
15894}
15895
15896SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15897 DAGCombinerInfo &DCI) const {
15898 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15899 "combine only useful on gfx8");
15900
15901 SDValue TruncSrc = N->getOperand(0);
15902 EVT VT = N->getValueType(0);
15903 if (VT != MVT::f16)
15904 return SDValue();
15905
15906 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15907 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15908 return SDValue();
15909
15910 SelectionDAG &DAG = DCI.DAG;
15911 SDLoc SL(N);
15912
15913 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15914 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15915 // casting back.
15916
15917 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15918 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15919 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15920 if (!A)
15921 return SDValue();
15922
15923 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15924 if (!B)
15925 return SDValue();
15926
15927 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15928 if (!C)
15929 return SDValue();
15930
15931 // This changes signaling nan behavior. If an input is a signaling nan, it
15932 // would have been quieted by the fpext originally. We don't care because
15933 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15934 // we would be worse off than just doing the promotion.
15935 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15936 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15937 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15938 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15939}
15940
// Pick the fused multiply-add opcode (FMAD or FMA) to use when contracting
// (fmul, fadd) pairs for nodes N0/N1, or 0 when no fusion is allowed.
// NOTE(review): lines 15949, 15951-15952 (the denormal-mode checks completing
// the FMAD condition) and 15959 (presumably an isFMAFasterThanFMulAndFAdd
// check guarding FMA) are elided from this excerpt — confirm upstream.
15941unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15942 const SDNode *N0,
15943 const SDNode *N1) const {
15944 EVT VT = N0->getValueType(0);
15945
15946 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15947 // support denormals ever.
15948 if (((VT == MVT::f32 &&
15950 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15953 return ISD::FMAD;
15954
// Otherwise FMA is usable when global fast fusion is on, or both nodes
// carry the contract fast-math flag.
15955 const TargetOptions &Options = DAG.getTarget().Options;
15956 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15957 (N0->getFlags().hasAllowContract() &&
15958 N1->getFlags().hasAllowContract())) &&
15960 return ISD::FMA;
15961 }
15962
// No fusion permitted.
15963 return 0;
15964}
15965
15966// For a reassociatable opcode perform:
15967// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15968SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15969 SelectionDAG &DAG) const {
15970 EVT VT = N->getValueType(0);
15971 if (VT != MVT::i32 && VT != MVT::i64)
15972 return SDValue();
15973
15974 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15975 return SDValue();
15976
15977 unsigned Opc = N->getOpcode();
15978 SDValue Op0 = N->getOperand(0);
15979 SDValue Op1 = N->getOperand(1);
15980
15981 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15982 return SDValue();
15983
15984 if (Op0->isDivergent())
15985 std::swap(Op0, Op1);
15986
15987 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15988 return SDValue();
15989
15990 SDValue Op2 = Op1.getOperand(1);
15991 Op1 = Op1.getOperand(0);
15992 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15993 return SDValue();
15994
15995 if (Op1->isDivergent())
15996 std::swap(Op1, Op2);
15997
15998 SDLoc SL(N);
15999 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16000 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16001}
16002
// Emit a mad_[iu]64_[iu]32 node computing N0 * N1 + N2 and truncate the
// 64-bit result to VT. The node also produces an i1 carry that is unused
// here.
// NOTE(review): line 16005 (selecting MAD_I64_I32 vs MAD_U64_U32 from
// `Signed` into MadOpc) is elided from this excerpt — confirm upstream.
16003static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16004 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16006 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
16007 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
16008 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
16009}
16010
16011// Fold
16012// y = lshr i64 x, 32
16013// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16014// with Const.hi == -1
16015// To
16016// res = mad_u64_u32 y.lo ,Const.lo, x.lo
// NOTE(review): the signature (line 16017) and the ConstantSDNode lookup for
// MulRHS (line 16031, producing `Const`) are elided from this excerpt —
// confirm upstream.
16018 SDValue MulLHS, SDValue MulRHS,
16019 SDValue AddRHS) {
// Canonicalize the SRL factor into MulLHS.
16020 if (MulRHS.getOpcode() == ISD::SRL)
16021 std::swap(MulLHS, MulRHS);
16022
16023 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16024 return SDValue();
16025
// The shift must be exactly (srl AddRHS, 32) so that the added value is the
// same x whose high half is being multiplied.
16026 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16027 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16028 MulLHS.getOperand(0) != AddRHS)
16029 return SDValue();
16030
// The constant's high 32 bits must all be ones for the fold to be valid.
16032 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16033 return SDValue();
16034
16035 SDValue ConstMul =
16036 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
16037 return getMad64_32(DAG, SL, MVT::i64,
16038 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16039 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16040}
16041
16042// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16043// multiplies, if any.
16044//
16045// Full 64-bit multiplies that feed into an addition are lowered here instead
16046// of using the generic expansion. The generic expansion ends up with
16047// a tree of ADD nodes that prevents us from using the "add" part of the
16048// MAD instruction. The expansion produced here results in a chain of ADDs
16049// instead of a tree.
16050SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16051 DAGCombinerInfo &DCI) const {
16052 assert(N->isAnyAdd());
16053
16054 SelectionDAG &DAG = DCI.DAG;
16055 EVT VT = N->getValueType(0);
16056 SDLoc SL(N);
16057 SDValue LHS = N->getOperand(0);
16058 SDValue RHS = N->getOperand(1);
16059
16060 if (VT.isVector())
16061 return SDValue();
16062
16063 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16064 // result in scalar registers for uniform values.
16065 if (!N->isDivergent() && Subtarget->hasSMulHi())
16066 return SDValue();
16067
16068 unsigned NumBits = VT.getScalarSizeInBits();
16069 if (NumBits <= 32 || NumBits > 64)
16070 return SDValue();
16071
16072 if (LHS.getOpcode() != ISD::MUL) {
16073 assert(RHS.getOpcode() == ISD::MUL);
16074 std::swap(LHS, RHS);
16075 }
16076
16077 // Avoid the fold if it would unduly increase the number of multiplies due to
16078 // multiple uses, except on hardware with full-rate multiply-add (which is
16079 // part of full-rate 64-bit ops).
16080 if (!Subtarget->hasFullRate64Ops()) {
16081 unsigned NumUsers = 0;
16082 for (SDNode *User : LHS->users()) {
16083 // There is a use that does not feed into addition, so the multiply can't
16084 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16085 if (!User->isAnyAdd())
16086 return SDValue();
16087
16088 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16089 // MUL + 3xADD + 3xADDC over 3xMAD.
16090 ++NumUsers;
16091 if (NumUsers >= 3)
16092 return SDValue();
16093 }
16094 }
16095
16096 SDValue MulLHS = LHS.getOperand(0);
16097 SDValue MulRHS = LHS.getOperand(1);
16098 SDValue AddRHS = RHS;
16099
16100 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16101 return FoldedMAD;
16102
16103 // Always check whether operands are small unsigned values, since that
16104 // knowledge is useful in more cases. Check for small signed values only if
16105 // doing so can unlock a shorter code sequence.
16106 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
16107 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
16108
16109 bool MulSignedLo = false;
16110 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16111 MulSignedLo =
16112 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
16113 }
16114
16115 // The operands and final result all have the same number of bits. If
16116 // operands need to be extended, they can be extended with garbage. The
16117 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16118 // truncated away in the end.
16119 if (VT != MVT::i64) {
16120 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
16121 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
16122 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
16123 }
16124
16125 // The basic code generated is conceptually straightforward. Pseudo code:
16126 //
16127 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16128 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16129 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16130 //
16131 // The second and third lines are optional, depending on whether the factors
16132 // are {sign,zero}-extended or not.
16133 //
16134 // The actual DAG is noisier than the pseudo code, but only due to
16135 // instructions that disassemble values into low and high parts, and
16136 // assemble the final result.
16137 SDValue One = DAG.getConstant(1, SL, MVT::i32);
16138
16139 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
16140 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
16141 SDValue Accum =
16142 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16143
16144 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16145 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16146
16147 if (!MulLHSUnsigned32) {
16148 auto MulLHSHi =
16149 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
16150 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
16151 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16152 }
16153
16154 if (!MulRHSUnsigned32) {
16155 auto MulRHSHi =
16156 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
16157 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
16158 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16159 }
16160
16161 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
16162 Accum = DAG.getBitcast(MVT::i64, Accum);
16163 }
16164
16165 if (VT != MVT::i64)
16166 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
16167 return Accum;
16168}
16169
16170SDValue
16171SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16172 DAGCombinerInfo &DCI) const {
16173 SDValue RHS = N->getOperand(1);
16174 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16175 if (!CRHS)
16176 return SDValue();
16177
16178 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16179 // common.
16180 uint64_t Val = CRHS->getZExtValue();
16181 if (countr_zero(Val) >= 32) {
16182 SelectionDAG &DAG = DCI.DAG;
16183 SDLoc SL(N);
16184 SDValue LHS = N->getOperand(0);
16185
16186 // Avoid carry machinery if we know the low half of the add does not
16187 // contribute to the final result.
16188 //
16189 // add i64:x, K if computeTrailingZeros(K) >= 32
16190 // => build_pair (add x.hi, K.hi), x.lo
16191
16192 // Breaking the 64-bit add here with this strange constant is unlikely
16193 // to interfere with addressing mode patterns.
16194
16195 SDValue Hi = getHiHalf64(LHS, DAG);
16196 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16197 unsigned Opcode = N->getOpcode();
16198 if (Opcode == ISD::PTRADD)
16199 Opcode = ISD::ADD;
16200 SDValue AddHi =
16201 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16202
16203 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16204 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16205 }
16206
16207 return SDValue();
16208}
16209
16210// Collect the ultimate src of each of the mul node's operands, and confirm
16211// each operand is 8 bytes.
16212static std::optional<ByteProvider<SDValue>>
16213handleMulOperand(const SDValue &MulOperand) {
16214 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16215 if (!Byte0 || Byte0->isConstantZero()) {
16216 return std::nullopt;
16217 }
16218 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16219 if (Byte1 && !Byte1->isConstantZero()) {
16220 return std::nullopt;
16221 }
16222 return Byte0;
16223}
16224
// Merge two v_perm byte-select masks. A byte lane holding 0x0c selects the
// constant-zero byte; the asserts require that, in every lane, at least one
// of the two masks carries that zero selector, so the merged mask keeps each
// lane's non-zero selector (or 0x0c when both lanes select zero).
static unsigned addPermMasks(unsigned First, unsigned Second) {
  const unsigned ZeroSelect = 0x0c0c0c0c;

  const unsigned FirstCs = First & ZeroSelect;
  const unsigned SecondCs = Second & ZeroSelect;
  const unsigned FirstNoCs = First & ~ZeroSelect;
  const unsigned SecondNoCs = Second & ~ZeroSelect;

  // Every byte lane must have the zero-select bits set in at least one mask;
  // otherwise both masks would select real bytes into the same lane.
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
16238
// One source dword of a dot-product operand: later code reads the members
// SrcOp (the source SDValue), PermMask (the v_perm byte-select mask built
// for it) and DWordOffset (which 32-bit word of the source is used).
// NOTE(review): the SrcOp and DWordOffset member declarations are on lines
// elided from this excerpt (16240, 16242) — confirm upstream.
16239struct DotSrc {
16241 int64_t PermMask;
16243};
16244
// placeSources: record byte providers Src0/Src1 for dot-product step `Step`
// into the Src0s/Src1s groups, merging v_perm masks when the same source
// dword is already present in a group; 0x0c lanes in a mask select zero.
// NOTE(review): the signature opening (lines 16245-16247, taking the two
// ByteProvider<SDValue> values and Src0s) is elided from this excerpt.
16248 SmallVectorImpl<DotSrc> &Src1s, int Step) {
16249
16250 assert(Src0.Src.has_value() && Src1.Src.has_value());
16251 // Src0s and Src1s are empty, just place arbitrarily.
16252 if (Step == 0) {
// Step 0 seeds the groups: the selected byte goes in the top lane and the
// other three lanes select zero (0x0c).
16253 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16254 Src0.SrcOffset / 4});
16255 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16256 Src1.SrcOffset / 4});
16257 return;
16258 }
16259
// Try both pairings (Src0 into the group matching it first, then Src1).
16260 for (int BPI = 0; BPI < 2; BPI++) {
16261 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16262 if (BPI == 1) {
16263 BPP = {Src1, Src0};
16264 }
16265 unsigned ZeroMask = 0x0c0c0c0c;
16266 unsigned FMask = 0xFF << (8 * (3 - Step));
16267
// Build per-provider masks selecting the source byte in this step's lane
// and zero everywhere else.
16268 unsigned FirstMask =
16269 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16270 unsigned SecondMask =
16271 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16272 // Attempt to find Src vector which contains our SDValue, if so, add our
16273 // perm mask to the existing one. If we are unable to find a match for the
16274 // first SDValue, attempt to find match for the second.
16275 int FirstGroup = -1;
16276 for (int I = 0; I < 2; I++) {
16277 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16278 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16279 return IterElt.SrcOp == *BPP.first.Src &&
16280 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16281 };
16282
16283 auto *Match = llvm::find_if(Srcs, MatchesFirst);
16284 if (Match != Srcs.end()) {
16285 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16286 FirstGroup = I;
16287 break;
16288 }
16289 }
// If the first provider matched one group, the second goes into the other
// group — merged when its dword is already there, appended otherwise.
16290 if (FirstGroup != -1) {
16291 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16292 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16293 return IterElt.SrcOp == *BPP.second.Src &&
16294 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16295 };
16296 auto *Match = llvm::find_if(Srcs, MatchesSecond);
16297 if (Match != Srcs.end()) {
16298 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16299 } else
16300 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16301 return;
16302 }
16303 }
16304
16305 // If we have made it here, then we could not find a match in Src0s or Src1s
16306 // for either Src0 or Src1, so just place them arbitrarily.
16307
16308 unsigned ZeroMask = 0x0c0c0c0c;
16309 unsigned FMask = 0xFF << (8 * (3 - Step));
16310
16311 Src0s.push_back(
16312 {*Src0.Src,
16313 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16314 Src0.SrcOffset / 4});
16315 Src1s.push_back(
16316 {*Src1.Src,
16317 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16318 Src1.SrcOffset / 4});
16319}
16320
16322 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16323 bool IsAny) {
16324
16325 // If we just have one source, just permute it accordingly.
16326 if (Srcs.size() == 1) {
16327 auto *Elt = Srcs.begin();
16328 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
16329
16330 // v_perm will produce the original value
16331 if (Elt->PermMask == 0x3020100)
16332 return EltOp;
16333
16334 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16335 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
16336 }
16337
16338 auto *FirstElt = Srcs.begin();
16339 auto *SecondElt = std::next(FirstElt);
16340
16342
16343 // If we have multiple sources in the chain, combine them via perms (using
16344 // calculated perm mask) and Ors.
16345 while (true) {
16346 auto FirstMask = FirstElt->PermMask;
16347 auto SecondMask = SecondElt->PermMask;
16348
16349 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16350 unsigned FirstPlusFour = FirstMask | 0x04040404;
16351 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
16352 // original 0x0C.
16353 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16354
16355 auto PermMask = addPermMasks(FirstMask, SecondMask);
16356 auto FirstVal =
16357 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16358 auto SecondVal =
16359 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
16360
16361 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
16362 SecondVal,
16363 DAG.getConstant(PermMask, SL, MVT::i32)));
16364
16365 FirstElt = std::next(SecondElt);
16366 if (FirstElt == Srcs.end())
16367 break;
16368
16369 SecondElt = std::next(FirstElt);
16370 // If we only have a FirstElt, then just combine that into the cumulative
16371 // source node.
16372 if (SecondElt == Srcs.end()) {
16373 auto EltOp =
16374 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16375
16376 Perms.push_back(
16377 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16378 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16379 break;
16380 }
16381 }
16382
16383 assert(Perms.size() == 1 || Perms.size() == 2);
16384 return Perms.size() == 2
16385 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
16386 : Perms[0];
16387}
16388
16389static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16390 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16391 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16392 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16393 EntryMask += ZeroMask;
16394 }
16395}
16396
16397static bool isMul(const SDValue Op) {
16398 auto Opcode = Op.getOpcode();
16399
16400 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16401 Opcode == AMDGPUISD::MUL_I24);
16402}
16403
16404static std::optional<bool>
16406 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16407 const SDValue &S1Op, const SelectionDAG &DAG) {
16408 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
16409 // of the dot4 is irrelevant.
16410 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16411 return false;
16412
16413 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16414 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16415 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16416 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16417 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16418 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16419
16420 assert(!(S0IsUnsigned && S0IsSigned));
16421 assert(!(S1IsUnsigned && S1IsSigned));
16422
16423 // There are 9 possible permutations of
16424 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16425
16426 // In two permutations, the sign bits are known to be the same for both Ops,
16427 // so simply return Signed / Unsigned corresponding to the MSB
16428
16429 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16430 return S0IsSigned;
16431
16432 // In another two permutations, the sign bits are known to be opposite. In
16433 // this case return std::nullopt to indicate a bad match.
16434
16435 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16436 return std::nullopt;
16437
16438 // In the remaining five permutations, we don't know the value of the sign
16439 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16440 // the upper bits must be extension bits. Thus, the only ways for the sign
16441 // bit to be unknown is if it was sign extended from unknown value, or if it
16442 // was any extended. In either case, it is correct to use the signed
16443 // version of the signedness semantics of dot4
16444
16445 // In two of such permutations, we known the sign bit is set for
16446 // one op, and the other is unknown. It is okay to used signed version of
16447 // dot4.
16448 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16449 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16450 return true;
16451
16452 // In one such permutation, we don't know either of the sign bits. It is okay
16453 // to used the signed version of dot4.
16454 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16455 return true;
16456
16457 // In two of such permutations, we known the sign bit is unset for
16458 // one op, and the other is unknown. Return std::nullopt to indicate a
16459 // bad match.
16460 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16461 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16462 return std::nullopt;
16463
16464 llvm_unreachable("Fully covered condition");
16465}
16466
// Combine patterns rooted at an ADD:
//  * mul + add into mad64_32 forms (via tryFoldToMad64_32),
//  * chains of byte-wise multiplies summed together into v_dot4 intrinsics
//    (amdgcn_sdot4 / amdgcn_udot4),
//  * add of an extended setcc / uaddo_carry into carry-using node forms.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Prefer folding a mul feeding this add into a 64-bit MAD when available.
  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // Try to match a chain of up to four byte-multiplies summed together
  // (optionally with an accumulator) and rewrite it as a dot4 intrinsic.
  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;

    // Match the v_dot4 tree, while collecting src nodes.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      // Each step of the chain must have a mul on exactly one side; the
      // other side continues the add chain (or is the accumulator).
      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
      if (!Src1)
        break;

      // All multiplies in the chain must agree on signedness so a single
      // sdot4/udot4 selection is valid for the whole chain.
      auto IterIsSigned = checkDot4MulSignedness(
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(*Src0, *Src1, Src0s, Src1s, I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
        auto Src0 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
        // Both adds were consumed by muls, so the accumulator is zero.
        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
        ChainLength = I + 2;
        break;
      }

      TempNode = TempNode->getOperand(AddIdx);
      // Remember the non-mul operand; the last one recorded becomes the
      // accumulator input of the dot.
      Src2s.push_back(TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    }

    // A dot needs at least two multiplies to be profitable.
    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Src0s, ChainLength);
      fixMasks(Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutation).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      // Every byte of the mask must select a distinct byte, otherwise the
      // permutation is not a pure commutation and cannot be dropped.
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(SrcBytes, NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto *FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

        auto *SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
                                              SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
                                             MVT::getIntegerVT(32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
                                             MVT::getIntegerVT(32));
      }
    }

    // Otherwise materialize each dword operand via v_perm / or chains.
    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Src0s, false, true);
      Src1 = resolveSources(DAG, SL, Src1s, false, true);
    }

    assert(IsSigned);
    SDValue Src2 =
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);

    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
                                                  : Intrinsic::amdgcn_udot4,
                                        SL, MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));

    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(RHS.getOperand(1)))
      break;
    SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
    return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }
  return SDValue();
}
16670
// Combine patterns rooted at a PTRADD: rewrite into plain arithmetic when the
// ptradd cannot be folded into a memory instruction's immediate offset, and
// reassociate to keep uniform operands together for scalar execution.
SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // The following folds transform PTRADDs into regular arithmetic in cases
  // where the PTRADD wouldn't be folded as an immediate offset into memory
  // instructions anyway. They are target-specific in that other targets might
  // prefer to not lose information about the pointer arithmetic.

  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
  // Adapted from DAGCombiner::visitADDLikeCommutative.
  SDValue V, K;
  if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
    SDNodeFlags ShlFlags = N1->getFlags();
    // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
    // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
    // preserved.
    SDNodeFlags NewShlFlags =
        ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
            : SDNodeFlags();
    SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
    DCI.AddToWorklist(Inner.getNode());
    return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
  }

  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
  // performAddCombine.
  if (N1.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  // If the 32 low bits of the constant are all zero, there is nothing to fold
  // into an immediate offset, so it's better to eliminate the unnecessary
  // addition for the lower 32 bits than to preserve the PTRADD.
  // Analogous to a fold in performAddCombine.
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // The remaining fold only applies to (ptradd x, (add y, z)) with a
  // single-use inner add.
  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
    return SDValue();

  SDValue X = N0;
  SDValue Y = N1.getOperand(0);
  SDValue Z = N1.getOperand(1);
  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

  if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
      Y->isDivergent() != Z->isDivergent()) {
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
    // y are uniform and z isn't.
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
    // z are uniform and y isn't.
    // The goal is to push uniform operands up in the computation, so that they
    // can be handled with scalar operations. We can't use reassociateScalarOps
    // for this since it requires two identical commutative operations to
    // reassociate.
    if (Y->isDivergent())
      std::swap(Y, Z);
    // If both additions in the original were NUW, reassociation preserves that.
    SDNodeFlags ReassocFlags =
        (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
    SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
    DCI.AddToWorklist(UniformInner.getNode());
    return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
  }

  return SDValue();
}
16750
// Combine patterns rooted at a SUB: fold a sub of an extended setcc into a
// carry-using node, and merge a usubo_carry on the LHS into one carry op.
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // The carry folds below only apply to 32-bit subtracts.
  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // sub x, zext (setcc) => usubo_carry x, 0, setcc
  // sub x, sext (setcc) => uaddo_carry x, 0, setcc
  unsigned Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  }

  if (LHS.getOpcode() == ISD::USUBO_CARRY) {
    // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
    if (!isNullConstant(LHS.getOperand(1)))
      return SDValue();
    SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
    return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
  }
  return SDValue();
}
16798
16799SDValue
16800SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16801 DAGCombinerInfo &DCI) const {
16802
16803 if (N->getValueType(0) != MVT::i32)
16804 return SDValue();
16805
16806 if (!isNullConstant(N->getOperand(1)))
16807 return SDValue();
16808
16809 SelectionDAG &DAG = DCI.DAG;
16810 SDValue LHS = N->getOperand(0);
16811
16812 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16813 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16814 unsigned LHSOpc = LHS.getOpcode();
16815 unsigned Opc = N->getOpcode();
16816 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16817 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16818 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16819 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16820 }
16821 return SDValue();
16822}
16823
16824SDValue SITargetLowering::performFAddCombine(SDNode *N,
16825 DAGCombinerInfo &DCI) const {
16826 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16827 return SDValue();
16828
16829 SelectionDAG &DAG = DCI.DAG;
16830 EVT VT = N->getValueType(0);
16831
16832 SDLoc SL(N);
16833 SDValue LHS = N->getOperand(0);
16834 SDValue RHS = N->getOperand(1);
16835
16836 // These should really be instruction patterns, but writing patterns with
16837 // source modifiers is a pain.
16838
16839 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16840 if (LHS.getOpcode() == ISD::FADD) {
16841 SDValue A = LHS.getOperand(0);
16842 if (A == LHS.getOperand(1)) {
16843 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16844 if (FusedOp != 0) {
16845 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16846 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16847 }
16848 }
16849 }
16850
16851 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16852 if (RHS.getOpcode() == ISD::FADD) {
16853 SDValue A = RHS.getOperand(0);
16854 if (A == RHS.getOperand(1)) {
16855 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16856 if (FusedOp != 0) {
16857 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16858 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16859 }
16860 }
16861 }
16862
16863 return SDValue();
16864}
16865
16866SDValue SITargetLowering::performFSubCombine(SDNode *N,
16867 DAGCombinerInfo &DCI) const {
16868 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16869 return SDValue();
16870
16871 SelectionDAG &DAG = DCI.DAG;
16872 SDLoc SL(N);
16873 EVT VT = N->getValueType(0);
16874 assert(!VT.isVector());
16875
16876 // Try to get the fneg to fold into the source modifier. This undoes generic
16877 // DAG combines and folds them into the mad.
16878 //
16879 // Only do this if we are not trying to support denormals. v_mad_f32 does
16880 // not support denormals ever.
16881 SDValue LHS = N->getOperand(0);
16882 SDValue RHS = N->getOperand(1);
16883 if (LHS.getOpcode() == ISD::FADD) {
16884 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16885 SDValue A = LHS.getOperand(0);
16886 if (A == LHS.getOperand(1)) {
16887 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16888 if (FusedOp != 0) {
16889 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16890 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16891
16892 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16893 }
16894 }
16895 }
16896
16897 if (RHS.getOpcode() == ISD::FADD) {
16898 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16899
16900 SDValue A = RHS.getOperand(0);
16901 if (A == RHS.getOperand(1)) {
16902 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16903 if (FusedOp != 0) {
16904 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16905 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16906 }
16907 }
16908 }
16909
16910 return SDValue();
16911}
16912
16913SDValue SITargetLowering::performFDivCombine(SDNode *N,
16914 DAGCombinerInfo &DCI) const {
16915 SelectionDAG &DAG = DCI.DAG;
16916 SDLoc SL(N);
16917 EVT VT = N->getValueType(0);
16918
16919 // fsqrt legality correlates to rsq availability.
16920 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16921 return SDValue();
16922
16923 SDValue LHS = N->getOperand(0);
16924 SDValue RHS = N->getOperand(1);
16925
16926 SDNodeFlags Flags = N->getFlags();
16927 SDNodeFlags RHSFlags = RHS->getFlags();
16928 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16929 !RHS->hasOneUse())
16930 return SDValue();
16931
16932 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16933 bool IsNegative = false;
16934 if (CLHS->isExactlyValue(1.0) ||
16935 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16936 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16937 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16938 if (RHS.getOpcode() == ISD::FSQRT) {
16939 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16940 SDValue Rsq =
16941 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16942 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16943 }
16944 }
16945 }
16946
16947 return SDValue();
16948}
16949
16950SDValue SITargetLowering::performFMulCombine(SDNode *N,
16951 DAGCombinerInfo &DCI) const {
16952 SelectionDAG &DAG = DCI.DAG;
16953 EVT VT = N->getValueType(0);
16954 EVT ScalarVT = VT.getScalarType();
16955 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
16956
16957 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16958 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16959 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16960 return SDValue();
16961 }
16962
16963 SDValue LHS = N->getOperand(0);
16964 SDValue RHS = N->getOperand(1);
16965
16966 // It is cheaper to realize i32 inline constants as compared against
16967 // materializing f16 or f64 (or even non-inline f32) values,
16968 // possible via ldexp usage, as shown below :
16969 //
16970 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16971 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16972 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
16973 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16974 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16975 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16976 if (!TrueNode)
16977 return SDValue();
16978 const ConstantFPSDNode *FalseNode =
16979 isConstOrConstSplatFP(RHS.getOperand(2));
16980 if (!FalseNode)
16981 return SDValue();
16982
16983 if (TrueNode->isNegative() != FalseNode->isNegative())
16984 return SDValue();
16985
16986 // For f32, only non-inline constants should be transformed.
16987 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16988 if (ScalarVT == MVT::f32 &&
16989 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16990 TII->isInlineConstant(FalseNode->getValueAPF()))
16991 return SDValue();
16992
16993 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16994 if (TrueNodeExpVal == INT_MIN)
16995 return SDValue();
16996 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16997 if (FalseNodeExpVal == INT_MIN)
16998 return SDValue();
16999
17000 SDLoc SL(N);
17001 SDValue SelectNode =
17002 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17003 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17004 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17005
17006 LHS = TrueNode->isNegative()
17007 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17008 : LHS;
17009
17010 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17011 }
17012
17013 return SDValue();
17014}
17015
// Match FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) and
// rewrite it as FDOT2((V2F16)S0, (V2F16)S1, (F32)z) when dot10 instructions
// are available.
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(0);
  SDValue Op2 = N->getOperand(1);
  SDValue FMA = N->getOperand(2);

  // The accumulator must itself be an FMA, and both multiplied operands must
  // be fp-extended (from f16 elements, checked below).
  if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    Op1 = Op1.getOperand(0);
    Op2 = Op2.getOperand(0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      return SDValue();

    // Record the two source vectors and the extracted lane of the outer mul.
    SDValue Vec1 = Op1.getOperand(0);
    SDValue Idx1 = Op1.getOperand(1);
    SDValue Vec2 = Op2.getOperand(0);

    SDValue FMAOp1 = FMA.getOperand(0);
    SDValue FMAOp2 = FMA.getOperand(1);
    SDValue FMAAcc = FMA.getOperand(2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(0);
    FMAOp2 = FMAOp2.getOperand(0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(0);
    SDValue Vec4 = FMAOp2.getOperand(0);
    SDValue Idx2 = FMAOp1.getOperand(1);

    // Both multiplies must extract the same lane from their respective
    // vectors, and the two multiplies must use different lanes.
    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    // Both lanes must pair up the same two vectors (in either order).
    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
                         DAG.getTargetConstant(0, SL, MVT::i1));
    }
  }
  return SDValue();
}
17088
17089SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17090 DAGCombinerInfo &DCI) const {
17091 SelectionDAG &DAG = DCI.DAG;
17092 SDLoc SL(N);
17093
17094 SDValue LHS = N->getOperand(0);
17095 SDValue RHS = N->getOperand(1);
17096 EVT VT = LHS.getValueType();
17097 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17098
17099 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
17100 if (!CRHS) {
17102 if (CRHS) {
17103 std::swap(LHS, RHS);
17104 CC = getSetCCSwappedOperands(CC);
17105 }
17106 }
17107
17108 if (CRHS) {
17109 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17110 isBoolSGPR(LHS.getOperand(0))) {
17111 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
17112 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
17113 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
17114 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
17115 if ((CRHS->isAllOnes() &&
17116 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
17117 (CRHS->isZero() &&
17118 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
17119 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
17120 DAG.getAllOnesConstant(SL, MVT::i1));
17121 if ((CRHS->isAllOnes() &&
17122 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
17123 (CRHS->isZero() &&
17124 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
17125 return LHS.getOperand(0);
17126 }
17127
17128 const APInt &CRHSVal = CRHS->getAPIntValue();
17129 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
17130 LHS.getOpcode() == ISD::SELECT &&
17131 isa<ConstantSDNode>(LHS.getOperand(1)) &&
17132 isa<ConstantSDNode>(LHS.getOperand(2)) &&
17133 isBoolSGPR(LHS.getOperand(0))) {
17134 // Given CT != FT:
17135 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
17136 // setcc (select cc, CT, CF), CF, ne => cc
17137 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
17138 // setcc (select cc, CT, CF), CT, eq => cc
17139 const APInt &CT = LHS.getConstantOperandAPInt(1);
17140 const APInt &CF = LHS.getConstantOperandAPInt(2);
17141
17142 if (CT != CF) {
17143 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
17144 (CT == CRHSVal && CC == ISD::SETNE))
17145 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17146 if ((CF == CRHSVal && CC == ISD::SETNE) ||
17147 (CT == CRHSVal && CC == ISD::SETEQ))
17148 return LHS.getOperand(0);
17149 }
17150 }
17151 }
17152
17153 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
17154 // following cases where information about the lower 32-bits of its operands
17155 // is known:
17156 //
17157 // If LHS.lo32 == RHS.lo32:
17158 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17159 // If LHS.lo32 != RHS.lo32:
17160 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17161 // If LHS.lo32 >= RHS.lo32 (unsigned):
17162 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17163 // If LHS.lo32 > RHS.lo32 (unsigned):
17164 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17165 // If LHS.lo32 <= RHS.lo32 (unsigned):
17166 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17167 // If LHS.lo32 < RHS.lo32 (unsigned):
17168 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17169 if (VT == MVT::i64) {
17170 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
17171 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
17172
17173 // NewCC is valid iff we can truncate the setcc to only test the upper 32
17174 // bits
17176
17177 switch (CC) {
17178 default:
17179 break;
17180 case ISD::SETEQ: {
17181 const std::optional<bool> KnownEq =
17182 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17183 if (KnownEq)
17184 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
17185
17186 break;
17187 }
17188 case ISD::SETNE: {
17189 const std::optional<bool> KnownEq =
17190 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17191 if (KnownEq)
17192 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
17193
17194 break;
17195 }
17196 case ISD::SETULT:
17197 case ISD::SETUGE:
17198 case ISD::SETLT:
17199 case ISD::SETGE: {
17200 const std::optional<bool> KnownUge =
17201 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17202 if (KnownUge) {
17203 if (*KnownUge) {
17204 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17205 NewCC = CC;
17206 } else {
17207 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17208 NewCC = CC == ISD::SETULT ? ISD::SETULE
17209 : CC == ISD::SETUGE ? ISD::SETUGT
17210 : CC == ISD::SETLT ? ISD::SETLE
17211 : ISD::SETGT;
17212 }
17213 }
17214 break;
17215 }
17216 case ISD::SETULE:
17217 case ISD::SETUGT:
17218 case ISD::SETLE:
17219 case ISD::SETGT: {
17220 const std::optional<bool> KnownUle =
17221 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17222 if (KnownUle) {
17223 if (*KnownUle) {
17224 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17225 NewCC = CC;
17226 } else {
17227 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17228 NewCC = CC == ISD::SETULE ? ISD::SETULT
17229 : CC == ISD::SETUGT ? ISD::SETUGE
17230 : CC == ISD::SETLE ? ISD::SETLT
17231 : ISD::SETGE;
17232 }
17233 }
17234 break;
17235 }
17236 }
17237
17238 if (NewCC != ISD::SETCC_INVALID)
17239 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
17240 getHiHalf64(RHS, DAG), NewCC);
17241 }
17242
17243 // Eliminate setcc by using carryout from add/sub instruction
17244
17245 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
17246 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
17247 // similarly for subtraction
17248
17249 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
17250 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
17251
17252 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
17254 (CC == ISD::SETUGT &&
17256 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
17257 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
17258 bool IsAdd = LHS.getOpcode() == ISD::ADD;
17259
17260 SDValue Op0 = LHS.getOperand(0);
17261 SDValue Op1 = LHS.getOperand(1);
17262
17263 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
17264 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
17265
17266 SDValue Op0Hi = getHiHalf64(Op0, DAG);
17267 SDValue Op1Hi = getHiHalf64(Op1, DAG);
17268
17269 SDValue NodeLo =
17270 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
17271 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17272
17273 SDValue CarryInHi = NodeLo.getValue(1);
17274 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
17275 SL, DAG.getVTList(MVT::i32, MVT::i1),
17276 {Op0Hi, Op1Hi, CarryInHi});
17277
17278 SDValue ResultLo = NodeLo.getValue(0);
17279 SDValue ResultHi = NodeHi.getValue(0);
17280
17281 SDValue JoinedResult =
17282 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
17283
17284 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
17285 SDValue Overflow = NodeHi.getValue(1);
17286 DCI.CombineTo(LHS.getNode(), Result);
17287 return Overflow;
17288 }
17289
17290 if (VT != MVT::f32 && VT != MVT::f64 &&
17291 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17292 return SDValue();
17293
17294 // Match isinf/isfinite pattern
17295 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
17296 // (fcmp one (fabs x), inf) -> (fp_class x,
17297 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
17298 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
17299 LHS.getOpcode() == ISD::FABS) {
17300 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
17301 if (!CRHS)
17302 return SDValue();
17303
17304 const APFloat &APF = CRHS->getValueAPF();
17305 if (APF.isInfinity() && !APF.isNegative()) {
17306 const unsigned IsInfMask =
17308 const unsigned IsFiniteMask =
17312 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
17313 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
17314 DAG.getConstant(Mask, SL, MVT::i32));
17315 }
17316 }
17317
17318 return SDValue();
17319}
17320
SDValue
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  // Byte index selected by this CVT_F32_UBYTEn node (0-3), derived from the
  // opcode's distance from CVT_F32_UBYTE0.
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  // Shift starts as the same operand as Src; it is peeled below (zext, then
  // srl/shl) so a constant byte shift can be folded into the byte index.
  SDValue Shift = N->getOperand(0);

  // TODO: Extend type shouldn't matter (assuming legal types).
  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
    Shift = Shift.getOperand(0);

  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
    // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
    if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
      SDValue Shifted = DAG.getZExtOrTrunc(
          Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);

      // Fold the constant shift into the byte index: a left shift moved the
      // wanted byte up (so select a lower byte), a right shift moved it down
      // (so select a higher byte).
      unsigned ShiftOffset = 8 * Offset;
      if (Shift.getOpcode() == ISD::SHL)
        ShiftOffset -= C->getZExtValue();
      else
        ShiftOffset += C->getZExtValue();

      // Only valid if the adjusted index still addresses a whole byte within
      // the 32-bit source (note ShiftOffset may have wrapped if the SHL
      // amount exceeded 8 * Offset; the < 32 check rejects that too).
      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
                           MVT::f32, Shifted);
      }
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Only the single byte selected by this node is demanded from Src.
  APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
  if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
    // We simplified Src. If this node is not dead, visit it again so it is
    // folded properly.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (SDValue DemandedSrc =
          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);

  return SDValue();
}
17375
17376SDValue SITargetLowering::performClampCombine(SDNode *N,
17377 DAGCombinerInfo &DCI) const {
17378 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
17379 if (!CSrc)
17380 return SDValue();
17381
17382 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17383 const APFloat &F = CSrc->getValueAPF();
17384 APFloat Zero = APFloat::getZero(F.getSemantics());
17385 if (F < Zero ||
17386 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17387 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
17388 }
17389
17390 APFloat One(F.getSemantics(), "1.0");
17391 if (F > One)
17392 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
17393
17394 return SDValue(CSrc, 0);
17395}
17396
17397SDValue SITargetLowering::performSelectCombine(SDNode *N,
17398 DAGCombinerInfo &DCI) const {
17399
17400 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17401 // integer).
17402 // Detect when CMP and SELECT use the same constant and fold them to avoid
17403 // loading the constant twice. Specifically handles patterns like:
17404 // %cmp = icmp eq i32 %val, 4242
17405 // %sel = select i1 %cmp, i32 4242, i32 %other
17406 // It can be optimized to reuse %val instead of 4242 in select.
17407 SDValue Cond = N->getOperand(0);
17408 SDValue TrueVal = N->getOperand(1);
17409 SDValue FalseVal = N->getOperand(2);
17410
17411 // Check if condition is a comparison.
17412 if (Cond.getOpcode() != ISD::SETCC)
17413 return SDValue();
17414
17415 SDValue LHS = Cond.getOperand(0);
17416 SDValue RHS = Cond.getOperand(1);
17417 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17418
17419 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17420 bool isInteger = LHS.getValueType().isInteger();
17421
17422 // Handle simple floating-point and integer types only.
17423 if (!isFloatingPoint && !isInteger)
17424 return SDValue();
17425
17426 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17427 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17428 if (!isEquality && !isNonEquality)
17429 return SDValue();
17430
17431 SDValue ArgVal, ConstVal;
17432 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17433 (isInteger && isa<ConstantSDNode>(RHS))) {
17434 ConstVal = RHS;
17435 ArgVal = LHS;
17436 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17437 (isInteger && isa<ConstantSDNode>(LHS))) {
17438 ConstVal = LHS;
17439 ArgVal = RHS;
17440 } else {
17441 return SDValue();
17442 }
17443
17444 // Skip optimization for inlinable immediates.
17445 if (isFloatingPoint) {
17446 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17447 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17448 return SDValue();
17449 } else {
17451 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17452 return SDValue();
17453 }
17454
17455 // For equality and non-equality comparisons, patterns:
17456 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17457 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17458 if (!(isEquality && TrueVal == ConstVal) &&
17459 !(isNonEquality && FalseVal == ConstVal))
17460 return SDValue();
17461
17462 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17463 SDValue SelectRHS =
17464 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17465 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17466 SelectLHS, SelectRHS);
17467}
17468
17470 DAGCombinerInfo &DCI) const {
17471 switch (N->getOpcode()) {
17472 case ISD::ADD:
17473 case ISD::SUB:
17474 case ISD::SHL:
17475 case ISD::SRL:
17476 case ISD::SRA:
17477 case ISD::AND:
17478 case ISD::OR:
17479 case ISD::XOR:
17480 case ISD::MUL:
17481 case ISD::SETCC:
17482 case ISD::SELECT:
17483 case ISD::SMIN:
17484 case ISD::SMAX:
17485 case ISD::UMIN:
17486 case ISD::UMAX:
17487 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17488 return Res;
17489 break;
17490 default:
17491 break;
17492 }
17493
17494 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17495 return SDValue();
17496
17497 switch (N->getOpcode()) {
17498 case ISD::ADD:
17499 return performAddCombine(N, DCI);
17500 case ISD::PTRADD:
17501 return performPtrAddCombine(N, DCI);
17502 case ISD::SUB:
17503 return performSubCombine(N, DCI);
17504 case ISD::UADDO_CARRY:
17505 case ISD::USUBO_CARRY:
17506 return performAddCarrySubCarryCombine(N, DCI);
17507 case ISD::FADD:
17508 return performFAddCombine(N, DCI);
17509 case ISD::FSUB:
17510 return performFSubCombine(N, DCI);
17511 case ISD::FDIV:
17512 return performFDivCombine(N, DCI);
17513 case ISD::FMUL:
17514 return performFMulCombine(N, DCI);
17515 case ISD::SETCC:
17516 return performSetCCCombine(N, DCI);
17517 case ISD::SELECT:
17518 if (auto Res = performSelectCombine(N, DCI))
17519 return Res;
17520 break;
17521 case ISD::FMAXNUM:
17522 case ISD::FMINNUM:
17523 case ISD::FMAXNUM_IEEE:
17524 case ISD::FMINNUM_IEEE:
17525 case ISD::FMAXIMUM:
17526 case ISD::FMINIMUM:
17527 case ISD::FMAXIMUMNUM:
17528 case ISD::FMINIMUMNUM:
17529 case ISD::SMAX:
17530 case ISD::SMIN:
17531 case ISD::UMAX:
17532 case ISD::UMIN:
17533 case AMDGPUISD::FMIN_LEGACY:
17534 case AMDGPUISD::FMAX_LEGACY:
17535 return performMinMaxCombine(N, DCI);
17536 case ISD::FMA:
17537 return performFMACombine(N, DCI);
17538 case ISD::AND:
17539 return performAndCombine(N, DCI);
17540 case ISD::OR:
17541 return performOrCombine(N, DCI);
17542 case ISD::FSHR: {
17544 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17545 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17546 return matchPERM(N, DCI);
17547 }
17548 break;
17549 }
17550 case ISD::XOR:
17551 return performXorCombine(N, DCI);
17552 case ISD::ANY_EXTEND:
17553 case ISD::ZERO_EXTEND:
17554 return performZeroOrAnyExtendCombine(N, DCI);
17556 return performSignExtendInRegCombine(N, DCI);
17557 case AMDGPUISD::FP_CLASS:
17558 return performClassCombine(N, DCI);
17559 case ISD::FCANONICALIZE:
17560 return performFCanonicalizeCombine(N, DCI);
17561 case AMDGPUISD::RCP:
17562 return performRcpCombine(N, DCI);
17563 case ISD::FLDEXP:
17564 case AMDGPUISD::FRACT:
17565 case AMDGPUISD::RSQ:
17566 case AMDGPUISD::RCP_LEGACY:
17567 case AMDGPUISD::RCP_IFLAG:
17568 case AMDGPUISD::RSQ_CLAMP: {
17569 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17570 SDValue Src = N->getOperand(0);
17571 if (Src.isUndef())
17572 return Src;
17573 break;
17574 }
17575 case ISD::SINT_TO_FP:
17576 case ISD::UINT_TO_FP:
17577 return performUCharToFloatCombine(N, DCI);
17578 case ISD::FCOPYSIGN:
17579 return performFCopySignCombine(N, DCI);
17580 case AMDGPUISD::CVT_F32_UBYTE0:
17581 case AMDGPUISD::CVT_F32_UBYTE1:
17582 case AMDGPUISD::CVT_F32_UBYTE2:
17583 case AMDGPUISD::CVT_F32_UBYTE3:
17584 return performCvtF32UByteNCombine(N, DCI);
17585 case AMDGPUISD::FMED3:
17586 return performFMed3Combine(N, DCI);
17587 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17588 return performCvtPkRTZCombine(N, DCI);
17589 case AMDGPUISD::CLAMP:
17590 return performClampCombine(N, DCI);
17591 case ISD::SCALAR_TO_VECTOR: {
17592 SelectionDAG &DAG = DCI.DAG;
17593 EVT VT = N->getValueType(0);
17594
17595 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17596 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17597 SDLoc SL(N);
17598 SDValue Src = N->getOperand(0);
17599 EVT EltVT = Src.getValueType();
17600 if (EltVT != MVT::i16)
17601 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17602
17603 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17604 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17605 }
17606
17607 break;
17608 }
17610 return performExtractVectorEltCombine(N, DCI);
17612 return performInsertVectorEltCombine(N, DCI);
17613 case ISD::FP_ROUND:
17614 return performFPRoundCombine(N, DCI);
17615 case ISD::LOAD: {
17616 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17617 return Widened;
17618 [[fallthrough]];
17619 }
17620 default: {
17621 if (!DCI.isBeforeLegalize()) {
17622 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17623 return performMemSDNodeCombine(MemNode, DCI);
17624 }
17625
17626 break;
17627 }
17628 }
17629
17631}
17632
17633/// Helper function for adjustWritemask
17634static unsigned SubIdx2Lane(unsigned Idx) {
17635 switch (Idx) {
17636 default:
17637 return ~0u;
17638 case AMDGPU::sub0:
17639 return 0;
17640 case AMDGPU::sub1:
17641 return 1;
17642 case AMDGPU::sub2:
17643 return 2;
17644 case AMDGPU::sub3:
17645 return 3;
17646 case AMDGPU::sub4:
17647 return 4; // Possible with TFE/LWE
17648 }
17649}
17650
17651/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17652SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17653 SelectionDAG &DAG) const {
17654 unsigned Opcode = Node->getMachineOpcode();
17655
17656 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17657 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17658 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17659 return Node; // not implemented for D16
17660
17661 SDNode *Users[5] = {nullptr};
17662 unsigned Lane = 0;
17663 unsigned DmaskIdx =
17664 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17665 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17666 unsigned NewDmask = 0;
17667 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17668 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17669 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17670 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17671 unsigned TFCLane = 0;
17672 bool HasChain = Node->getNumValues() > 1;
17673
17674 if (OldDmask == 0) {
17675 // These are folded out, but on the chance it happens don't assert.
17676 return Node;
17677 }
17678
17679 unsigned OldBitsSet = llvm::popcount(OldDmask);
17680 // Work out which is the TFE/LWE lane if that is enabled.
17681 if (UsesTFC) {
17682 TFCLane = OldBitsSet;
17683 }
17684
17685 // Try to figure out the used register components
17686 for (SDUse &Use : Node->uses()) {
17687
17688 // Don't look at users of the chain.
17689 if (Use.getResNo() != 0)
17690 continue;
17691
17692 SDNode *User = Use.getUser();
17693
17694 // Abort if we can't understand the usage
17695 if (!User->isMachineOpcode() ||
17696 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17697 return Node;
17698
17699 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17700 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17701 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17702 // set, etc.
17703 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17704 if (Lane == ~0u)
17705 return Node;
17706
17707 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17708 if (UsesTFC && Lane == TFCLane) {
17709 Users[Lane] = User;
17710 } else {
17711 // Set which texture component corresponds to the lane.
17712 unsigned Comp;
17713 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17714 Comp = llvm::countr_zero(Dmask);
17715 Dmask &= ~(1 << Comp);
17716 }
17717
17718 // Abort if we have more than one user per component.
17719 if (Users[Lane])
17720 return Node;
17721
17722 Users[Lane] = User;
17723 NewDmask |= 1 << Comp;
17724 }
17725 }
17726
17727 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17728 bool NoChannels = !NewDmask;
17729 if (NoChannels) {
17730 if (!UsesTFC) {
17731 // No uses of the result and not using TFC. Then do nothing.
17732 return Node;
17733 }
17734 // If the original dmask has one channel - then nothing to do
17735 if (OldBitsSet == 1)
17736 return Node;
17737 // Use an arbitrary dmask - required for the instruction to work
17738 NewDmask = 1;
17739 }
17740 // Abort if there's no change
17741 if (NewDmask == OldDmask)
17742 return Node;
17743
17744 unsigned BitsSet = llvm::popcount(NewDmask);
17745
17746 // Check for TFE or LWE - increase the number of channels by one to account
17747 // for the extra return value
17748 // This will need adjustment for D16 if this is also included in
17749 // adjustWriteMask (this function) but at present D16 are excluded.
17750 unsigned NewChannels = BitsSet + UsesTFC;
17751
17752 int NewOpcode =
17753 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17754 assert(NewOpcode != -1 &&
17755 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17756 "failed to find equivalent MIMG op");
17757
17758 // Adjust the writemask in the node
17760 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17761 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17762 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17763
17764 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17765
17766 MVT ResultVT = NewChannels == 1
17767 ? SVT
17768 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17769 : NewChannels == 5 ? 8
17770 : NewChannels);
17771 SDVTList NewVTList =
17772 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17773
17774 MachineSDNode *NewNode =
17775 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17776
17777 if (HasChain) {
17778 // Update chain.
17779 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17780 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17781 }
17782
17783 if (NewChannels == 1) {
17784 assert(Node->hasNUsesOfValue(1, 0));
17785 SDNode *Copy =
17786 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17787 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17788 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17789 return nullptr;
17790 }
17791
17792 // Update the users of the node with the new indices
17793 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17794 SDNode *User = Users[i];
17795 if (!User) {
17796 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17797 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17798 if (i || !NoChannels)
17799 continue;
17800 } else {
17801 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17802 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17803 if (NewUser != User) {
17804 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17805 DAG.RemoveDeadNode(User);
17806 }
17807 }
17808
17809 switch (Idx) {
17810 default:
17811 break;
17812 case AMDGPU::sub0:
17813 Idx = AMDGPU::sub1;
17814 break;
17815 case AMDGPU::sub1:
17816 Idx = AMDGPU::sub2;
17817 break;
17818 case AMDGPU::sub2:
17819 Idx = AMDGPU::sub3;
17820 break;
17821 case AMDGPU::sub3:
17822 Idx = AMDGPU::sub4;
17823 break;
17824 }
17825 }
17826
17827 DAG.RemoveDeadNode(Node);
17828 return nullptr;
17829}
17830
17832 if (Op.getOpcode() == ISD::AssertZext)
17833 Op = Op.getOperand(0);
17834
17835 return isa<FrameIndexSDNode>(Op);
17836}
17837
17838/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17839/// with frame index operands.
17840/// LLVM assumes that inputs are to these instructions are registers.
17841SDNode *
17843 SelectionDAG &DAG) const {
17844 if (Node->getOpcode() == ISD::CopyToReg) {
17845 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17846 SDValue SrcVal = Node->getOperand(2);
17847
17848 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17849 // to try understanding copies to physical registers.
17850 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17851 SDLoc SL(Node);
17853 SDValue VReg = DAG.getRegister(
17854 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17855
17856 SDNode *Glued = Node->getGluedNode();
17857 SDValue ToVReg = DAG.getCopyToReg(
17858 Node->getOperand(0), SL, VReg, SrcVal,
17859 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17860 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17861 VReg, ToVReg.getValue(1));
17862 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17863 DAG.RemoveDeadNode(Node);
17864 return ToResultReg.getNode();
17865 }
17866 }
17867
17869 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17870 if (!isFrameIndexOp(Node->getOperand(i))) {
17871 Ops.push_back(Node->getOperand(i));
17872 continue;
17873 }
17874
17875 SDLoc DL(Node);
17876 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17877 Node->getOperand(i).getValueType(),
17878 Node->getOperand(i)),
17879 0));
17880 }
17881
17882 return DAG.UpdateNodeOperands(Node, Ops);
17883}
17884
17885/// Fold the instructions after selecting them.
17886/// Returns null if users were already updated.
17888 SelectionDAG &DAG) const {
17890 unsigned Opcode = Node->getMachineOpcode();
17891
17892 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17893 !TII->isGather4(Opcode) &&
17894 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17895 return adjustWritemask(Node, DAG);
17896 }
17897
17898 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17900 return Node;
17901 }
17902
17903 switch (Opcode) {
17904 case AMDGPU::V_DIV_SCALE_F32_e64:
17905 case AMDGPU::V_DIV_SCALE_F64_e64: {
17906 // Satisfy the operand register constraint when one of the inputs is
17907 // undefined. Ordinarily each undef value will have its own implicit_def of
17908 // a vreg, so force these to use a single register.
17909 SDValue Src0 = Node->getOperand(1);
17910 SDValue Src1 = Node->getOperand(3);
17911 SDValue Src2 = Node->getOperand(5);
17912
17913 if ((Src0.isMachineOpcode() &&
17914 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17915 (Src0 == Src1 || Src0 == Src2))
17916 break;
17917
17918 MVT VT = Src0.getValueType().getSimpleVT();
17919 const TargetRegisterClass *RC =
17920 getRegClassFor(VT, Src0.getNode()->isDivergent());
17921
17923 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17924
17925 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17926 Src0, SDValue());
17927
17928 // src0 must be the same register as src1 or src2, even if the value is
17929 // undefined, so make sure we don't violate this constraint.
17930 if (Src0.isMachineOpcode() &&
17931 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17932 if (Src1.isMachineOpcode() &&
17933 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17934 Src0 = Src1;
17935 else if (Src2.isMachineOpcode() &&
17936 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17937 Src0 = Src2;
17938 else {
17939 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17940 Src0 = UndefReg;
17941 Src1 = UndefReg;
17942 }
17943 } else
17944 break;
17945
17947 Ops[1] = Src0;
17948 Ops[3] = Src1;
17949 Ops[5] = Src2;
17950 Ops.push_back(ImpDef.getValue(1));
17951 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17952 }
17953 default:
17954 break;
17955 }
17956
17957 return Node;
17958}
17959
17960// Any MIMG instructions that use tfe or lwe require an initialization of the
17961// result register that will be written in the case of a memory access failure.
17962// The required code is also added to tie this init code to the result of the
17963// img instruction.
17966 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17967 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17968 MachineBasicBlock &MBB = *MI.getParent();
17969
17970 int DstIdx =
17971 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17972 unsigned InitIdx = 0;
17973
17974 if (TII->isImage(MI)) {
17975 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17976 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17977 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17978
17979 if (!TFE && !LWE) // intersect_ray
17980 return;
17981
17982 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17983 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17984 unsigned D16Val = D16 ? D16->getImm() : 0;
17985
17986 if (!TFEVal && !LWEVal)
17987 return;
17988
17989 // At least one of TFE or LWE are non-zero
17990 // We have to insert a suitable initialization of the result value and
17991 // tie this to the dest of the image instruction.
17992
17993 // Calculate which dword we have to initialize to 0.
17994 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17995
17996 // check that dmask operand is found.
17997 assert(MO_Dmask && "Expected dmask operand in instruction");
17998
17999 unsigned dmask = MO_Dmask->getImm();
18000 // Determine the number of active lanes taking into account the
18001 // Gather4 special case
18002 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
18003
18004 bool Packed = !Subtarget->hasUnpackedD16VMem();
18005
18006 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18007
18008 // Abandon attempt if the dst size isn't large enough
18009 // - this is in fact an error but this is picked up elsewhere and
18010 // reported correctly.
18011 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18012
18013 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18014 if (DstSize < InitIdx)
18015 return;
18016 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
18017 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18018 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18019 } else {
18020 return;
18021 }
18022
18023 const DebugLoc &DL = MI.getDebugLoc();
18024
18025 // Create a register for the initialization value.
18026 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
18027 unsigned NewDst = 0; // Final initialized value will be in here
18028
18029 // If PRTStrictNull feature is enabled (the default) then initialize
18030 // all the result registers to 0, otherwise just the error indication
18031 // register (VGPRn+1)
18032 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18033 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18034
18035 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
18036 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18037 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
18038 // Initialize dword
18039 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
18040 // clang-format off
18041 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
18042 .addImm(0);
18043 // clang-format on
18044 // Insert into the super-reg
18045 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
18046 .addReg(PrevDst)
18047 .addReg(SubReg)
18049
18050 PrevDst = NewDst;
18051 }
18052
18053 // Add as an implicit operand
18054 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
18055
18056 // Tie the just added implicit operand to the dst
18057 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18058}
18059
18060/// Assign the register class depending on the number of
18061/// bits set in the writemask
18063 SDNode *Node) const {
18065
18066 MachineFunction *MF = MI.getMF();
18068
18069 if (TII->isVOP3(MI.getOpcode())) {
18070 // Make sure constant bus requirements are respected.
18071 TII->legalizeOperandsVOP3(MRI, MI);
18072
18073 if (TII->isMAI(MI)) {
18074 // The ordinary src0, src1, src2 were legalized above.
18075 //
18076 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18077 // as a separate instruction.
18078 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18079 AMDGPU::OpName::scale_src0);
18080 if (Src0Idx != -1) {
18081 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18082 AMDGPU::OpName::scale_src1);
18083 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18084 TII->usesConstantBus(MRI, MI, Src1Idx))
18085 TII->legalizeOpWithMove(MI, Src1Idx);
18086 }
18087 }
18088
18089 return;
18090 }
18091
18092 if (TII->isImage(MI))
18093 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
18094}
18095
18097 uint64_t Val) {
18098 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
18099 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
18100}
18101
// Build a 128-bit (v4i32) buffer resource descriptor around a 64-bit pointer:
// the low two dwords are the pointer, the high two are the default resource
// data format constants.
// NOTE(review): the first signature line is elided in this view.
                                                const SDLoc &DL,
                                                SDValue Ptr) const {

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
      buildSMovImm32(DAG, DL, 0),
      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  SDValue SubRegHi = SDValue(
      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
      DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
      DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
18128
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
// NOTE(review): the first signature line is elided in this view.
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  // Split the 64-bit pointer into its two 32-bit halves.
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  // Fold extra descriptor bits (e.g. stride/Add-TID) into the high pointer
  // dword when requested.
  if (RsrcDword1) {
    PtrHi =
        SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                   DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                0);
  }

  SDValue DataLo =
      buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  // Assemble the four dwords into an SGPR_128 REG_SEQUENCE.
  const SDValue Ops[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
      PtrLo,
      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      PtrHi,
      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
      DataLo,
      DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
      DataHi,
      DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
18162
18163//===----------------------------------------------------------------------===//
18164// SI Inline Assembly Support
18165//===----------------------------------------------------------------------===//
18166
// Resolve an inline-asm register constraint ('s'/'r', 'v', 'a', "VA", or a
// physical-register spec) to a (register, register class) pair. Returns
// {0, nullptr} for constraints this target cannot honor.
// NOTE(review): the first signature line is elided in this view.
std::pair<unsigned, const TargetRegisterClass *>
                                         StringRef Constraint,
                                         MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    // Check if we cannot determine the bit size of the given value type. This
    // can happen, for example, in this situation where we have an empty struct
    // (size 0): `call void asm "", "v"({} poison)`-
    if (VT == MVT::Other)
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      // Scalar (SGPR) constraint.
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        // NOTE(review): the line selecting an SGPR class for other widths is
        // elided in this view.
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      // Vector (VGPR) constraint.
      switch (BitWidth) {
      case 1:
        return std::pair(0U, nullptr);
      case 16:
        // With real true16 instructions a 16-bit value can live in a 16-bit
        // VGPR half; otherwise use a full (low-256) 32-bit VGPR.
        RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
                                             : &AMDGPU::VGPR_32_Lo256RegClass;
        break;
      default:
        RC = Subtarget->has1024AddressableVGPRs()
                 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
                 : TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a':
      // Accumulator (AGPR) constraint; only meaningful with MAI support.
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 1:
        return std::pair(0U, nullptr);
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
  } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
    // "VA": either VGPR or AGPR (AV_* super classes).
    const unsigned BitWidth = VT.getSizeInBits();
    switch (BitWidth) {
    case 16:
      RC = &AMDGPU::AV_32RegClass;
      break;
    default:
      RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
      if (!RC)
        return std::pair(0U, nullptr);
      break;
    }
  }

  // We actually support i128, i16 and f16 as inline parameters
  // even if they are not reported as legal
  if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
             VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
    return std::pair(0U, RC);

  // Handle explicit physical-register constraints like "{v0}" / "{s[0:3]}".
  auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
  if (Kind != '\0') {
    if (Kind == 'v') {
      RC = &AMDGPU::VGPR_32_Lo256RegClass;
    } else if (Kind == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Kind == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      if (NumRegs > 1) {
        // Range constraint: the whole range must fit in the base class.
        if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
          return std::pair(0U, nullptr);

        uint32_t Width = NumRegs * 32;
        // Prohibit constraints for register ranges with a width that does not
        // match the required type.
        if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
          return std::pair(0U, nullptr);

        MCRegister Reg = RC->getRegister(Idx);
        // NOTE(review): the preceding VGPR-class check line is elided in this
        // view; the chain picks the width-sized class of the matching bank.
          RC = TRI->getVGPRClassForBitWidth(Width);
        else if (SIRegisterInfo::isSGPRClass(RC))
          RC = TRI->getSGPRClassForBitWidth(Width);
        else if (SIRegisterInfo::isAGPRClass(RC))
          RC = TRI->getAGPRClassForBitWidth(Width);
        if (RC) {
          Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
          if (!Reg) {
            // The register class does not contain the requested register,
            // e.g., because it is an SGPR pair that would violate alignment
            // requirements.
            return std::pair(0U, nullptr);
          }
          return std::pair(Reg, RC);
        }
      }

      // Check for lossy scalar/vector conversions.
      if (VT.isVector() && VT.getSizeInBits() != 32)
        return std::pair(0U, nullptr);
      if (Idx < RC->getNumRegs())
        return std::pair(RC->getRegister(Idx), RC);
      return std::pair(0U, nullptr);
    }
  }

  // Fall back to the generic handling, then refine the class to the physical
  // register's base class if a register was found.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}
18309
18310static bool isImmConstraint(StringRef Constraint) {
18311 if (Constraint.size() == 1) {
18312 switch (Constraint[0]) {
18313 default:
18314 break;
18315 case 'I':
18316 case 'J':
18317 case 'A':
18318 case 'B':
18319 case 'C':
18320 return true;
18321 }
18322 } else if (Constraint == "DA" || Constraint == "DB") {
18323 return true;
18324 }
18325 return false;
18326}
18327
// Classify an inline-asm constraint string: register-class for 's'/'v'/'a'
// and "VA", immediate ("other") for the constraints isImmConstraint accepts,
// otherwise defer to the generic classifier.
// NOTE(review): the signature lines are elided in this view.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 's':
    case 'v':
    case 'a':
      // SGPR / VGPR / AGPR class constraints.
      return C_RegisterClass;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "VA")
      // VGPR-or-AGPR super-class constraint.
      return C_RegisterClass;
  }
  if (isImmConstraint(Constraint)) {
    return C_Other;
  }
  return TargetLowering::getConstraintType(Constraint);
}
18348
// Mask \p Val down to its low \p Size bits.
// NOTE(review): the guarding condition line (presumably skipping the mask
// when Size covers all 64 bits) is elided in this view.
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
    Val = Val & maskTrailingOnes<uint64_t>(Size);
  }
  return Val;
}
18355
// Lower an inline-asm operand: for immediate constraints, validate the
// constant and append it (width-masked) as a target constant; otherwise the
// generic lowering handles it.
// NOTE(review): the first signature line is elided in this view.
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  if (isImmConstraint(Constraint)) {
    uint64_t Val;
    if (getAsmOperandConstVal(Op, Val) &&
        checkAsmConstraintVal(Op, Constraint, Val)) {
      // Drop bits beyond the operand's scalar width before emitting.
      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
    }
  } else {
    // NOTE(review): the fallback call (presumably the TargetLowering
    // default) is elided in this view.
  }
}
18371
// Extract a sign-extended constant value from an inline-asm operand.
// Handles integer and FP constants (FP is bitcast to integer) and 16-bit
// splat build_vectors. Returns false for anything wider than 64 bits or for
// 16-bit values when the subtarget lacks 16-bit instructions.
// NOTE(review): the signature and the dyn_cast lines introducing C/V are
// elided in this view.
  unsigned Size = Op.getScalarValueSizeInBits();
  if (Size > 64)
    return false;

  if (Size == 16 && !Subtarget->has16BitInsts())
    return false;

    Val = C->getSExtValue();
    return true;
  }
    // FP constants are reported via their bit pattern.
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
    // Splat build_vector of two defined 16-bit elements.
    if (Size != 16 || Op.getNumOperands() != 2)
      return false;
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      return false;
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}
18405
// Validate a constant against an immediate inline-asm constraint:
//   'I' inline literal, 'J' signed 16-bit, 'A' inlinable literal,
//   'B' signed 32-bit, 'C' unsigned 32-bit (or inlinable),
//   "DA" both 32-bit halves inlinable, "DB" any 64-bit value.
// NOTE(review): the first signature line and the 'I'/'C' return lines are
// elided in this view.
                                              uint64_t Val) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'I':
    case 'J':
      return isInt<16>(Val);
    case 'A':
      return checkAsmConstraintValA(Op, Val);
    case 'B':
      return isInt<32>(Val);
    case 'C':
      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      // Both halves of the 64-bit value must individually be inlinable.
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, HiBits, 32) &&
             checkAsmConstraintValA(Op, LoBits, 32);
    }
    if (Constraint == "DB") {
      return true;
    }
  }
  llvm_unreachable("Invalid asm constraint");
}
18437
// \returns true if \p Val is an inlinable literal for the operand's type,
// considering at most \p MaxSize bits. 16-bit types dispatch to the
// type-specific (i16/f16/bf16 and packed v2*) encodings.
// NOTE(review): the first signature line is elided in this view.
                                              unsigned MaxSize) const {
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  if (Size == 16) {
    MVT VT = Op.getSimpleValueType();
    switch (VT.SimpleTy) {
    default:
      return false;
    case MVT::i16:
      return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
    case MVT::f16:
      return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
    case MVT::bf16:
      return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
    case MVT::v2i16:
      return AMDGPU::getInlineEncodingV2I16(Val).has_value();
    case MVT::v2f16:
      return AMDGPU::getInlineEncodingV2F16(Val).has_value();
    case MVT::v2bf16:
      return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
    }
  }
  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
    return true;
  return false;
}
18466
18467static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18468 switch (UnalignedClassID) {
18469 case AMDGPU::VReg_64RegClassID:
18470 return AMDGPU::VReg_64_Align2RegClassID;
18471 case AMDGPU::VReg_96RegClassID:
18472 return AMDGPU::VReg_96_Align2RegClassID;
18473 case AMDGPU::VReg_128RegClassID:
18474 return AMDGPU::VReg_128_Align2RegClassID;
18475 case AMDGPU::VReg_160RegClassID:
18476 return AMDGPU::VReg_160_Align2RegClassID;
18477 case AMDGPU::VReg_192RegClassID:
18478 return AMDGPU::VReg_192_Align2RegClassID;
18479 case AMDGPU::VReg_224RegClassID:
18480 return AMDGPU::VReg_224_Align2RegClassID;
18481 case AMDGPU::VReg_256RegClassID:
18482 return AMDGPU::VReg_256_Align2RegClassID;
18483 case AMDGPU::VReg_288RegClassID:
18484 return AMDGPU::VReg_288_Align2RegClassID;
18485 case AMDGPU::VReg_320RegClassID:
18486 return AMDGPU::VReg_320_Align2RegClassID;
18487 case AMDGPU::VReg_352RegClassID:
18488 return AMDGPU::VReg_352_Align2RegClassID;
18489 case AMDGPU::VReg_384RegClassID:
18490 return AMDGPU::VReg_384_Align2RegClassID;
18491 case AMDGPU::VReg_512RegClassID:
18492 return AMDGPU::VReg_512_Align2RegClassID;
18493 case AMDGPU::VReg_1024RegClassID:
18494 return AMDGPU::VReg_1024_Align2RegClassID;
18495 case AMDGPU::AReg_64RegClassID:
18496 return AMDGPU::AReg_64_Align2RegClassID;
18497 case AMDGPU::AReg_96RegClassID:
18498 return AMDGPU::AReg_96_Align2RegClassID;
18499 case AMDGPU::AReg_128RegClassID:
18500 return AMDGPU::AReg_128_Align2RegClassID;
18501 case AMDGPU::AReg_160RegClassID:
18502 return AMDGPU::AReg_160_Align2RegClassID;
18503 case AMDGPU::AReg_192RegClassID:
18504 return AMDGPU::AReg_192_Align2RegClassID;
18505 case AMDGPU::AReg_256RegClassID:
18506 return AMDGPU::AReg_256_Align2RegClassID;
18507 case AMDGPU::AReg_512RegClassID:
18508 return AMDGPU::AReg_512_Align2RegClassID;
18509 case AMDGPU::AReg_1024RegClassID:
18510 return AMDGPU::AReg_1024_Align2RegClassID;
18511 default:
18512 return -1;
18513 }
18514}
18515
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
// NOTE(review): the signature and the MFI/MRI setup lines, plus the early
// return for entry functions, are elided in this view.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  // Replace the placeholder stack registers with the ones the frame lowering
  // decided on.
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  // On wave32, make sure the implicit EXEC/VCC operands use the _LO forms.
  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
    }
  }

}
18582
// Compute known bits for target-specific SelectionDAG nodes. Currently only
// refines the mbcnt intrinsics; everything else defers to the AMDGPU base
// implementation.
// NOTE(review): the first signature line and the intrinsic case label are
// elided in this view.
                                                   KnownBits &Known,
                                                   const APInt &DemandedElts,
                                                   const SelectionDAG &DAG,
                                                   unsigned Depth) const {
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
      Known = KnownBits::add(Known, Known2);
      return;
    }
    }
    break;
  }
  }
      Op, Known, DemandedElts, DAG, Depth);
}
18613
// Compute known bits for a frame index.
// NOTE(review): the first signature line and one interior line (presumably
// the call into the generic implementation) are elided in this view.
    const int FI, KnownBits &Known, const MachineFunction &MF) const {

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
18623
// Mark the high bits of a workitem-id value as zero, based on the maximum
// workitem ID for dimension \p Dim.
// NOTE(review): the first signature line is elided in this view.
                                   GISelValueTracking &VT, KnownBits &Known,
                                   unsigned Dim) {
  unsigned MaxValue =
      ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
  Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}
18631
// Compute known bits for an S_BFE_{I,U}{32,64} bitfield extract: decode the
// packed offset/width operand, extract that field from the source's known
// bits, then sign- or zero-extend back to \p BFEWidth bits.
// NOTE(review): the first signature line and the MRI setup line are elided
// in this view.
                             KnownBits &Known, const APInt &DemandedElts,
                             unsigned BFEWidth, bool SExt, unsigned Depth) {
  const MachineOperand &Src1 = MI.getOperand(2);

  // Src1 encodes offset/width; it may be an immediate or a constant vreg.
  unsigned Src1Cst = 0;
  if (Src1.isImm()) {
    Src1Cst = Src1.getImm();
  } else if (Src1.isReg()) {
    auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
    if (!Cst)
      return;
    Src1Cst = Cst->Value.getZExtValue();
  } else {
    return;
  }

  // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
  // Width is always [22:16].
  const unsigned Offset =
      Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
  const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);

  if (Width >= BFEWidth) // Ill-formed.
    return;

  VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
                          Depth + 1);

  Known = Known.extractBits(Width, Offset);

  if (SExt)
    Known = Known.sext(BFEWidth);
  else
    Known = Known.zext(BFEWidth);
}
18669
// GlobalISel known-bits computation for target instructions: S_BFE variants,
// AMDGPU intrinsics (workitem ids, mbcnt, groupstaticsize), small buffer
// loads, boolean copies, and med3.
// NOTE(review): the first signature line is elided in this view.
    GISelValueTracking &VT, Register R, KnownBits &Known,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  Known.resetAll();
  const MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case AMDGPU::S_BFE_I32:
    return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
                            /*SExt=*/true, Depth);
  case AMDGPU::S_BFE_U32:
    return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
                            /*SExt=*/false, Depth);
  case AMDGPU::S_BFE_I64:
    return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
                            /*SExt=*/true, Depth);
  case AMDGPU::S_BFE_U64:
    return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
                            /*SExt=*/false, Depth);
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
                                 ? getSubtarget()->getWavefrontSizeLog2()
                                 : 5);
      KnownBits Known2;
      VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
                              Depth + 1);
      Known = KnownBits::add(Known, Known2);
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    // Zero-extended byte load: top 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    // Zero-extended short load: top 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
    // producing exactly 0 or 1.
    Known.Zero.setHighBits(Known.getBitWidth() - 1);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    KnownBits Known2;
    VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // The result is one of the inputs, so only bits known in all three
    // survive.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}
18763
// Compute the known alignment of a value defined by a target instruction.
// Intrinsic calls inherit the return alignment from the intrinsic's
// attributes; everything else reports no known alignment.
// NOTE(review): the signature lines and the line obtaining the LLVMContext
// are elided in this view.
                                                        unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
    // FIXME: Can this move to generic code? What about the case where the call
    // site specifies a lower alignment?
    Intrinsic::ID IID = GI->getIntrinsicID();
    AttributeList Attrs =
        Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}
18780
// Choose the preferred alignment for loop headers. GFX950 gets fetch-window
// alignment; GFX10+ with the instruction prefetcher gets cache-line alignment
// plus S_INST_PREFETCH bracketing for loops of suitable size.
// NOTE(review): the signature and the line computing PrefAlign, plus the TII
// setup line, are elided in this view.
  const Align CacheLineAlign = Align(64);

  // GFX950: Prevent an 8-byte instruction at loop header from being split by
  // the 32-byte instruction fetch window boundary. This avoids a significant
  // fetch delay after backward branch. We use 32-byte alignment with max
  // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
  if (ML && !DisableLoopAlignment &&
      getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
    const MachineBasicBlock *Header = ML->getHeader();
    // Respect user-specified or previously set alignment.
    if (Header->getAlignment() != PrefAlign)
      return Header->getAlignment();
    if (needsFetchWindowAlignment(*Header))
      return Align(32);
  }

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Conservatively size the loop body; bail early once it exceeds 192 bytes.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  // Bracket the loop with prefetch-mode changes, unless already present.
  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
18868
// Limit alignment padding for fetch-window-sensitive blocks.
// NOTE(review): the first signature line and the default-return line are
// elided in this view.
                                                MachineBasicBlock *MBB) const {
  // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
  // instruction could be split by the 32-byte fetch window boundary.
  // See getPrefLoopAlignment() for context.
  if (needsFetchWindowAlignment(*MBB))
    return 4;
}
18878
// \returns true if \p MBB's first real instruction is wider than 4 bytes on
// a subtarget sensitive to loop-head instruction splitting, i.e. the block
// benefits from 32-byte fetch-window alignment.
// NOTE(review): the TII setup line is elided in this view.
bool SITargetLowering::needsFetchWindowAlignment(
    const MachineBasicBlock &MBB) const {
  if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
    return false;
  for (const MachineInstr &MI : MBB) {
    // Skip meta instructions; only the first real instruction matters.
    if (MI.isMetaInstruction())
      continue;
    // Instructions larger than 4 bytes can be split by a 32-byte boundary.
    return TII->getInstSizeInBytes(MI) > 4;
  }
  return false;
}
18892
18893[[maybe_unused]]
18894static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18895 assert(N->getOpcode() == ISD::CopyFromReg);
18896 do {
18897 // Follow the chain until we find an INLINEASM node.
18898 N = N->getOperand(0).getNode();
18899 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18900 return true;
18901 } while (N->getOpcode() == ISD::CopyFromReg);
18902 return false;
18903}
18904
// Decide whether a SelectionDAG node produces a divergent (per-lane) value:
// VGPR copies, loads that may touch private memory, call sequences,
// divergence-source intrinsics, and read-modify-write atomics.
// NOTE(review): the signature lines and a few interior lines (the flat-load
// return, the INTRINSIC_* case labels) are elided in this view.
                                                   UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    // Prefer IR-level divergence info when the vreg maps back to a Value.
    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
      return UA->isDivergent(V);

    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
  }
  case ISD::CALLSEQ_END:
    return true;
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
  case AMDGPUISD::ATOMIC_CMP_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_ADD:
  case AMDGPUISD::BUFFER_ATOMIC_SUB:
  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
  case AMDGPUISD::BUFFER_ATOMIC_AND:
  case AMDGPUISD::BUFFER_ATOMIC_OR:
  case AMDGPUISD::BUFFER_ATOMIC_XOR:
  case AMDGPUISD::BUFFER_ATOMIC_INC:
  case AMDGPUISD::BUFFER_ATOMIC_DEC:
  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
  case AMDGPUISD::BUFFER_ATOMIC_FADD:
  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
    return false;
  }
}
18964
// \returns true if FP denormals are enabled for \p VT's scalar type in this
// function (f32 checked separately from f64/f16).
// NOTE(review): the first signature line and the two return lines querying
// the denormal mode are elided in this view.
                                                  EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
  case MVT::f16:
  default:
    return false;
  }
}
18977
// GlobalISel variant: \returns true if FP denormals are enabled for the
// scalar width of \p Ty (32-bit checked separately from 64/16-bit).
// NOTE(review): the first signature line is elided in this view.
    LLT Ty, const MachineFunction &MF) const {
  switch (Ty.getScalarSizeInBits()) {
  case 32:
    return !denormalModeIsFlushAllF32(MF);
  case 64:
  case 16:
    return !denormalModeIsFlushAllF64F16(MF);
  default:
    return false;
  }
}
18990
// NaN analysis for target nodes: CLAMP with DX10Clamp enabled never produces
// a NaN (NaN input clamps to 0); otherwise defer to the AMDGPU base class.
// NOTE(review): the first signature line, the MFI lookup, and the base-class
// call are elided in this view.
                                                       const APInt &DemandedElts,
                                                       const SelectionDAG &DAG,
                                                       bool SNaN,
                                                       unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();

    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

      DAG, SNaN, Depth);
}
19008
// On older subtargets, global FP atomic instructions have a hardcoded FP mode
// and do not support FP32 denormals, and only support v2f16/f64 denormals.
// NOTE(review): the signature line is elided in this view; this predicate
// reports whether the atomic's denormal behavior can be ignored.
  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
    return true;

  // Flush-to-zero (preserve-sign) mode matches the hardware's behavior.
  const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
  auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
  if (DenormMode == DenormalMode::getPreserveSign())
    return true;

  // TODO: Remove this.
  return RMW->getFunction()
      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
      .getValueAsBool();
}
19025
// Build an optimization remark noting that a hardware instruction was
// generated for \p RMW, including its operation and memory scope.
// NOTE(review): the signature line is elided in this view.
  LLVMContext &Ctx = RMW->getContext();
  StringRef MemScope =
      Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");

  return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
         << "Hardware instruction generated for atomic "
         << RMW->getOperationName(RMW->getOperation())
         << " operation at memory scope " << MemScope;
}
19036
19037static bool isV2F16OrV2BF16(Type *Ty) {
19038 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19039 Type *EltTy = VT->getElementType();
19040 return VT->getNumElements() == 2 &&
19041 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19042 }
19043
19044 return false;
19045}
19046
// \returns true if \p Ty is <2 x half>.
// NOTE(review): the dyn_cast line introducing VT is elided in this view.
static bool isV2F16(Type *Ty) {
  return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
}
19051
// \returns true if \p Ty is <2 x bfloat>.
// NOTE(review): the dyn_cast line introducing VT is elided in this view.
static bool isV2BF16(Type *Ty) {
  return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
}
19056
19057/// \return true if atomicrmw integer ops work for the type.
19058static bool isAtomicRMWLegalIntTy(Type *Ty) {
19059 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
19060 unsigned BW = IT->getBitWidth();
19061 return BW == 32 || BW == 64;
19062 }
19063
19064 return false;
19065}
19066
/// \return true if this atomicrmw xchg type can be selected.
// Legal xchg types: 32/64-bit integers, 32/64-bit pointers, float/double,
// and two-element vectors of 16-bit elements.
// NOTE(review): the dyn_cast line introducing the FixedVectorType VT is
// elided in this view.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
  Type *Ty = RMW->getType();
  if (isAtomicRMWLegalIntTy(Ty))
    return true;

  // Pointers are legal if their bit width matches a legal integer width.
  if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
    const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
    unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
    return BW == 32 || BW == 64;
  }

  if (Ty->isFloatTy() || Ty->isDoubleTy())
    return true;

    return VT->getNumElements() == 2 &&
           VT->getElementType()->getPrimitiveSizeInBits() == 16;
  }

  return false;
}
19089
19090/// \returns true if it's valid to emit a native instruction for \p RMW, based
19091/// on the properties of the target memory.
19092static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
19093 const AtomicRMWInst *RMW,
19094 bool HasSystemScope) {
19095 // The remote/fine-grained access logic is different from the integer
19096 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
19097 // fine-grained access does not work, even for a device local allocation.
19098 //
19099 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
19100 // allocations work.
19101 if (HasSystemScope) {
19102 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19103 RMW->hasMetadata("amdgpu.no.remote.memory"))
19104 return true;
19105 if (Subtarget.hasEmulatedSystemScopeAtomics())
19106 return true;
19107 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19108 return true;
19109
19110 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
19111}
19112
19113/// \return Action to perform on AtomicRMWInsts for integer operations.
19120
19121/// Return if a flat address space atomicrmw can access private memory.
19123 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
19124 return !MD ||
19126}
19127
19130 // For GAS, lower to flat atomic.
19131 return STI.hasGloballyAddressableScratch()
19134}
19135
19138 unsigned AS = RMW->getPointerAddressSpace();
19139 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
19141
19142 // 64-bit flat atomics that dynamically reside in private memory will silently
19143 // be dropped.
19144 //
19145 // Note that we will emit a new copy of the original atomic in the expansion,
19146 // which will be incrementally relegalized.
19147 const DataLayout &DL = RMW->getFunction()->getDataLayout();
19148 if (AS == AMDGPUAS::FLAT_ADDRESS &&
19149 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
19152
19153 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
19155 ORE.emit([=]() {
19156 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
19157 });
19158 return Kind;
19159 };
19160
19161 auto SSID = RMW->getSyncScopeID();
19162 bool HasSystemScope =
19163 SSID == SyncScope::System ||
19164 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
19165
19166 auto Op = RMW->getOperation();
19167 switch (Op) {
19169 // PCIe supports add and xchg for system atomics.
19170 return isAtomicRMWLegalXChgTy(RMW)
19173 case AtomicRMWInst::Add:
19174 // PCIe supports add and xchg for system atomics.
19176 case AtomicRMWInst::Sub:
19177 case AtomicRMWInst::And:
19178 case AtomicRMWInst::Or:
19179 case AtomicRMWInst::Xor:
19180 case AtomicRMWInst::Max:
19181 case AtomicRMWInst::Min:
19188 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
19190 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
19193 auto *IT = dyn_cast<IntegerType>(RMW->getType());
19194 if (!IT || IT->getBitWidth() != 32)
19196 }
19197
19200 if (Subtarget->hasEmulatedSystemScopeAtomics())
19202
19203 // On most subtargets, for atomicrmw operations other than add/xchg,
19204 // whether or not the instructions will behave correctly depends on where
19205 // the address physically resides and what interconnect is used in the
19206 // system configuration. On some some targets the instruction will nop,
19207 // and in others synchronization will only occur at degraded device scope.
19208 //
19209 // If the allocation is known local to the device, the instructions should
19210 // work correctly.
19211 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
19213
19214 // If fine-grained remote memory works at device scope, we don't need to
19215 // do anything.
19216 if (!HasSystemScope &&
19217 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19219
19220 // If we are targeting a remote allocated address, it depends what kind of
19221 // allocation the address belongs to.
19222 //
19223 // If the allocation is fine-grained (in host memory, or in PCIe peer
19224 // device memory), the operation will fail depending on the target.
19225 //
19226 // Note fine-grained host memory access does work on APUs or if XGMI is
19227 // used, but we do not know if we are targeting an APU or the system
19228 // configuration from the ISA version/target-cpu.
19229 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
19231
19234 // Atomic sub/or/xor do not work over PCI express, but atomic add
19235 // does. InstCombine transforms these with 0 to or, so undo that.
19236 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
19237 ConstVal && ConstVal->isNullValue())
19239 }
19240
19241 // If the allocation could be in remote, fine-grained memory, the rmw
19242 // instructions may fail. cmpxchg should work, so emit that. On some
19243 // system configurations, PCIe atomics aren't supported so cmpxchg won't
19244 // even work, so you're out of luck anyway.
19245
19246 // In summary:
19247 //
19248 // Cases that may fail:
19249 // - fine-grained pinned host memory
19250 // - fine-grained migratable host memory
19251 // - fine-grained PCIe peer device
19252 //
19253 // Cases that should work, but may be treated overly conservatively.
19254 // - fine-grained host memory on an APU
19255 // - fine-grained XGMI peer device
19257 }
19258
19260 }
19261 case AtomicRMWInst::FAdd: {
19262 Type *Ty = RMW->getType();
19263
19264 // TODO: Handle REGION_ADDRESS
19265 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19266 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
19267 // is fixed to round-to-nearest-even.
19268 //
19269 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19270 // round-to-nearest-even.
19271 //
19272 // We ignore the rounding mode problem, even in strictfp. The C++ standard
19273 // suggests it is OK if the floating-point mode may not match the calling
19274 // thread.
19275 if (Ty->isFloatTy()) {
19276 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19278 }
19279
19280 if (Ty->isDoubleTy()) {
19281 // Ignores denormal mode, but we don't consider flushing mandatory.
19282 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19284 }
19285
19286 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19288
19290 }
19291
19292 // LDS atomics respect the denormal mode from the mode register.
19293 //
19294 // Traditionally f32 global/buffer memory atomics would unconditionally
19295 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19296 // flush.
19297 //
19298 // On targets with flat atomic fadd, denormals would flush depending on
19299 // whether the target address resides in LDS or global memory. We consider
19300 // this flat-maybe-flush as will-flush.
19301 if (Ty->isFloatTy() &&
19302 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19305
19306 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19307 // safe. The message phrasing also should be better.
19308 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
19309 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19310 // gfx942, gfx12
19311 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19312 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19313 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19314 // gfx90a, gfx942, gfx12
19315 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19316 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19317
19318 // gfx942, gfx12
19319 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19320 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19321 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19322 // gfx90a, gfx942, gfx12
19323 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19324 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19325
19326 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19327 // buffer. gfx12 does have the buffer version.
19328 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19329 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19330 }
19331
19332 // global and flat atomic fadd f64: gfx90a, gfx942.
19333 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19334 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19335
19336 if (AS != AMDGPUAS::FLAT_ADDRESS) {
19337 if (Ty->isFloatTy()) {
19338 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19339 // gfx11+.
19340 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19341 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19342 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19343 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19344 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19345 } else {
19346 // gfx908
19347 if (RMW->use_empty() &&
19348 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19349 isV2F16(Ty))
19350 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19351 }
19352 }
19353
19354 // flat atomic fadd f32: gfx942, gfx11+.
19355 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19356 if (Subtarget->hasFlatAtomicFaddF32Inst())
19357 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19358
19359 // If it is in flat address space, and the type is float, we will try to
19360 // expand it, if the target supports global and lds atomic fadd. The
19361 // reason we need that is, in the expansion, we emit the check of
19362 // address space. If it is in global address space, we emit the global
19363 // atomic fadd; if it is in shared address space, we emit the LDS atomic
19364 // fadd.
19365 if (Subtarget->hasLDSFPAtomicAddF32()) {
19366 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19368 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19370 }
19371 }
19372 }
19373
19375 }
19377 case AtomicRMWInst::FMax: {
19378 Type *Ty = RMW->getType();
19379
19380 // LDS float and double fmin/fmax were always supported.
19381 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19382 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
19384 }
19385
19386 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
19387 // For flat and global cases:
19388 // float, double in gfx7. Manual claims denormal support.
19389 // Removed in gfx8.
19390 // float, double restored in gfx10.
19391 // double removed again in gfx11, so only f32 for gfx11/gfx12.
19392 //
19393 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19394 // no f32.
19395 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19396 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19397 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19398 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19399 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19400 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
19402 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19403 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19404 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19405 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19406 }
19407 }
19408
19410 }
19413 default:
19415 }
19416
19417 llvm_unreachable("covered atomicrmw op switch");
19418}
19419
19426
19433
19436 const AtomicCmpXchgInst *CmpX) const {
19437 unsigned AddrSpace = CmpX->getPointerAddressSpace();
19438 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19440
19441 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
19443
19444 const DataLayout &DL = CmpX->getDataLayout();
19445
19446 Type *ValTy = CmpX->getNewValOperand()->getType();
19447
19448 // If a 64-bit flat atomic may alias private, we need to avoid using the
19449 // atomic in the private case.
19450 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19452}
19453
19454const TargetRegisterClass *
19455SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19457 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19458 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19459 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19460 : &AMDGPU::SReg_32RegClass;
19461 if (!TRI->isSGPRClass(RC) && !isDivergent)
19462 return TRI->getEquivalentSGPRClass(RC);
19463 if (TRI->isSGPRClass(RC) && isDivergent) {
19464 if (Subtarget->hasGFX90AInsts())
19465 return TRI->getEquivalentAVClass(RC);
19466 return TRI->getEquivalentVGPRClass(RC);
19467 }
19468
19469 return RC;
19470}
19471
19472// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19473// uniform values (as produced by the mask results of control flow intrinsics)
19474// used outside of divergent blocks. The phi users need to also be treated as
19475// always uniform.
19476//
19477// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19478static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19479 unsigned WaveSize) {
19480 // FIXME: We assume we never cast the mask results of a control flow
19481 // intrinsic.
19482 // Early exit if the type won't be consistent as a compile time hack.
19483 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19484 if (!IT || IT->getBitWidth() != WaveSize)
19485 return false;
19486
19487 if (!isa<Instruction>(V))
19488 return false;
19489 if (!Visited.insert(V).second)
19490 return false;
19491 bool Result = false;
19492 for (const auto *U : V->users()) {
19494 if (V == U->getOperand(1)) {
19495 switch (Intrinsic->getIntrinsicID()) {
19496 default:
19497 Result = false;
19498 break;
19499 case Intrinsic::amdgcn_if_break:
19500 case Intrinsic::amdgcn_if:
19501 case Intrinsic::amdgcn_else:
19502 Result = true;
19503 break;
19504 }
19505 }
19506 if (V == U->getOperand(0)) {
19507 switch (Intrinsic->getIntrinsicID()) {
19508 default:
19509 Result = false;
19510 break;
19511 case Intrinsic::amdgcn_end_cf:
19512 case Intrinsic::amdgcn_loop:
19513 Result = true;
19514 break;
19515 }
19516 }
19517 } else {
19518 Result = hasCFUser(U, Visited, WaveSize);
19519 }
19520 if (Result)
19521 break;
19522 }
19523 return Result;
19524}
19525
19527 const Value *V) const {
19528 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
19529 if (CI->isInlineAsm()) {
19530 // FIXME: This cannot give a correct answer. This should only trigger in
19531 // the case where inline asm returns mixed SGPR and VGPR results, used
19532 // outside the defining block. We don't have a specific result to
19533 // consider, so this assumes if any value is SGPR, the overall register
19534 // also needs to be SGPR.
19535 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19537 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19538 for (auto &TC : TargetConstraints) {
19539 if (TC.Type == InlineAsm::isOutput) {
19541 const TargetRegisterClass *RC =
19542 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19543 TC.ConstraintVT)
19544 .second;
19545 if (RC && SIRI->isSGPRClass(RC))
19546 return true;
19547 }
19548 }
19549 }
19550 }
19552 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19553}
19554
19556 for (SDUse &Use : N->uses()) {
19558 if (getBasePtrIndex(M) == Use.getOperandNo())
19559 return true;
19560 }
19561 }
19562 return false;
19563}
19564
19566 SDValue N1) const {
19567 if (!N0.hasOneUse())
19568 return false;
19569 // Take care of the opportunity to keep N0 uniform
19570 if (N0->isDivergent() || !N1->isDivergent())
19571 return true;
19572 // Check if we have a good chance to form the memory access pattern with the
19573 // base and offset
19574 return (DAG.isBaseWithConstantOffset(N0) &&
19576}
19577
19579 Register N0, Register N1) const {
19580 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19581}
19582
19585 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19587 if (I.getMetadata("amdgpu.noclobber"))
19588 Flags |= MONoClobber;
19589 if (I.getMetadata("amdgpu.last.use"))
19590 Flags |= MOLastUse;
19591 return Flags;
19592}
19593
19595 Instruction *AI) const {
19596 // Given: atomicrmw fadd ptr %addr, float %val ordering
19597 //
19598 // With this expansion we produce the following code:
19599 // [...]
19600 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19601 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19602 //
19603 // atomicrmw.shared:
19604 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19605 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19606 // float %val ordering
19607 // br label %atomicrmw.phi
19608 //
19609 // atomicrmw.check.private:
19610 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19611 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19612 //
19613 // atomicrmw.private:
19614 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19615 // %loaded.private = load float, ptr addrspace(5) %cast.private
19616 // %val.new = fadd float %loaded.private, %val
19617 // store float %val.new, ptr addrspace(5) %cast.private
19618 // br label %atomicrmw.phi
19619 //
19620 // atomicrmw.global:
19621 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19622 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19623 // float %val ordering
19624 // br label %atomicrmw.phi
19625 //
19626 // atomicrmw.phi:
19627 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19628 // [ %loaded.private, %atomicrmw.private ],
19629 // [ %loaded.global, %atomicrmw.global ]
19630 // br label %atomicrmw.end
19631 //
19632 // atomicrmw.end:
19633 // [...]
19634 //
19635 //
19636 // For 64-bit atomics which may reside in private memory, we perform a simpler
19637 // version that only inserts the private check, and uses the flat operation.
19638
19639 IRBuilder<> Builder(AI);
19640 LLVMContext &Ctx = Builder.getContext();
19641
19642 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19643 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19645 Value *Addr = AI->getOperand(PtrOpIdx);
19646
19647 /// TODO: Only need to check private, then emit flat-known-not private (no
19648 /// need for shared block, or cast to global).
19650
19651 Align Alignment;
19652 if (RMW)
19653 Alignment = RMW->getAlign();
19654 else if (CX)
19655 Alignment = CX->getAlign();
19656 else
19657 llvm_unreachable("unhandled atomic operation");
19658
19659 // FullFlatEmulation is true if we need to issue the private, shared, and
19660 // global cases.
19661 //
19662 // If this is false, we are only dealing with the flat-targeting-private case,
19663 // where we only insert a check for private and still use the flat instruction
19664 // for global and shared.
19665
19666 bool FullFlatEmulation =
19667 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19668 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19669 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19670 RMW->getType()->isDoubleTy()));
19671
19672 // If the return value isn't used, do not introduce a false use in the phi.
19673 bool ReturnValueIsUsed = !AI->use_empty();
19674
19675 BasicBlock *BB = Builder.GetInsertBlock();
19676 Function *F = BB->getParent();
19677 BasicBlock *ExitBB =
19678 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19679 BasicBlock *SharedBB = nullptr;
19680
19681 BasicBlock *CheckPrivateBB = BB;
19682 if (FullFlatEmulation) {
19683 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19684 CheckPrivateBB =
19685 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19686 }
19687
19688 BasicBlock *PrivateBB =
19689 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19690 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19691 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19692
19693 std::prev(BB->end())->eraseFromParent();
19694 Builder.SetInsertPoint(BB);
19695
19696 Value *LoadedShared = nullptr;
19697 if (FullFlatEmulation) {
19698 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19699 {Addr}, nullptr, "is.shared");
19700 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19701 Builder.SetInsertPoint(SharedBB);
19702 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19704
19705 Instruction *Clone = AI->clone();
19706 Clone->insertInto(SharedBB, SharedBB->end());
19707 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19708 LoadedShared = Clone;
19709
19710 Builder.CreateBr(PhiBB);
19711 Builder.SetInsertPoint(CheckPrivateBB);
19712 }
19713
19714 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19715 {Addr}, nullptr, "is.private");
19716 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19717
19718 Builder.SetInsertPoint(PrivateBB);
19719
19720 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19722
19723 Value *LoadedPrivate;
19724 if (RMW) {
19725 LoadedPrivate = Builder.CreateAlignedLoad(
19726 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19727
19728 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19729 LoadedPrivate, RMW->getValOperand());
19730
19731 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19732 } else {
19733 auto [ResultLoad, Equal] =
19734 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19735 CX->getNewValOperand(), CX->getAlign());
19736
19737 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19738 ResultLoad, 0);
19739 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19740 }
19741
19742 Builder.CreateBr(PhiBB);
19743
19744 Builder.SetInsertPoint(GlobalBB);
19745
19746 // Continue using a flat instruction if we only emitted the check for private.
19747 Instruction *LoadedGlobal = AI;
19748 if (FullFlatEmulation) {
19749 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19751 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19752 }
19753
19754 AI->removeFromParent();
19755 AI->insertInto(GlobalBB, GlobalBB->end());
19756
19757 // The new atomicrmw may go through another round of legalization later.
19758 if (!FullFlatEmulation) {
19759 // We inserted the runtime check already, make sure we do not try to
19760 // re-expand this.
19761 // TODO: Should union with any existing metadata.
19762 MDBuilder MDB(F->getContext());
19763 MDNode *RangeNotPrivate =
19766 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19767 RangeNotPrivate);
19768 }
19769
19770 Builder.CreateBr(PhiBB);
19771
19772 Builder.SetInsertPoint(PhiBB);
19773
19774 if (ReturnValueIsUsed) {
19775 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19776 AI->replaceAllUsesWith(Loaded);
19777 if (FullFlatEmulation)
19778 Loaded->addIncoming(LoadedShared, SharedBB);
19779 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19780 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19781 Loaded->takeName(AI);
19782 }
19783
19784 Builder.CreateBr(ExitBB);
19785}
19786
19788 unsigned PtrOpIdx) {
19789 Value *PtrOp = I->getOperand(PtrOpIdx);
19792
19793 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19794 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19795 I->getIterator());
19796 I->setOperand(PtrOpIdx, ASCast);
19797}
19798
19801
19804
19807 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19808 ConstVal && ConstVal->isNullValue()) {
19809 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19811
19812 // We may still need the private-alias-flat handling below.
19813
19814 // TODO: Skip this for cases where we cannot access remote memory.
19815 }
19816 }
19817
19818 // The non-flat expansions should only perform the de-canonicalization of
19819 // identity values.
19821 return;
19822
19824}
19825
19832
19836
19838 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19839}
19840
19842 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19843 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19844
19846 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19847}
19848
19849LoadInst *
19851 IRBuilder<> Builder(AI);
19852 auto Order = AI->getOrdering();
19853
19854 // The optimization removes store aspect of the atomicrmw. Therefore, cache
19855 // must be flushed if the atomic ordering had a release semantics. This is
19856 // not necessary a fence, a release fence just coincides to do that flush.
19857 // Avoid replacing of an atomicrmw with a release semantics.
19858 if (isReleaseOrStronger(Order))
19859 return nullptr;
19860
19861 LoadInst *LI = Builder.CreateAlignedLoad(
19862 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19863 LI->setAtomic(Order, AI->getSyncScopeID());
19864 LI->copyMetadata(*AI);
19865 LI->takeName(AI);
19866 AI->replaceAllUsesWith(LI);
19867 AI->eraseFromParent();
19868 return LI;
19869}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1268
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1265
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1171
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5975
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1560
bool isNegative() const
Definition APFloat.h:1512
bool isNormal() const
Definition APFloat.h:1516
APInt bitcastToAPInt() const
Definition APFloat.h:1404
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1189
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1149
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1130
bool isInfinity() const
Definition APFloat.h:1509
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:805
Argument * getArg(unsigned i) const
Definition Function.h:886
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2787
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:229
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:223
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:226
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:428
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:906
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:970
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:313
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ AfterLegalizeTypes
Definition DAGCombine.h:17
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:167
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:178
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:241
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:186
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs