// LLVM 23.0.0git — SIISelLowering.cpp (doxygen file-listing header; navigation text converted to a comment so the listing remains valid C++)
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
48#include "llvm/Support/ModRef.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59static cl::opt<bool>
60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
61 cl::desc("Do not align and prefetch loops"),
62 cl::init(false));
63
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(false));
68
73
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
93 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
100 addRegisterClass(MVT::f32, V32RegClass);
101
102 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
106
107 addRegisterClass(MVT::f64, V64RegClass);
108 addRegisterClass(MVT::v2f32, V64RegClass);
109 addRegisterClass(MVT::Untyped, V64RegClass);
110
111 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
113
114 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
119
120 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
122
123 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
125
126 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
128
129 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
131
132 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
134
135 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
137
138 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
140
141 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(MVT::v10f32,
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32,
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32,
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
152
153 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(MVT::v16f32,
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
156
157 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
159
160 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(MVT::v16f64,
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
173 }
174
175 // Unless there are also VOP3P operations, not operations are really legal.
176 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
177 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
180 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(MVT::v32f32,
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
196
197 computeRegisterProperties(Subtarget->getRegisterInfo());
198
201
202 // The boolean content concept here is too inflexible. Compares only ever
203 // really produce a 1-bit result. Any copy/extend from these will turn into a
204 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
205 // it's what most targets use.
208
209 // We need to custom lower vector stores from local memory
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
218 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
222 Custom);
223
224 if (isTypeLegal(MVT::bf16)) {
225 for (unsigned Opc :
234 ISD::SETCC}) {
235 setOperationAction(Opc, MVT::bf16, Promote);
236 }
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
280
284 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
285
286 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
287
289 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
290
292 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
293 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
294
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299 Expand);
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 Expand);
305
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
309 Custom);
310
313 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
314
316
318
320 Expand);
321
322#if 0
324#endif
325
326 // We only support LOAD/STORE and vector manipulation ops for vectors
327 // with > 4 elements.
328 for (MVT VT :
329 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
337 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
338 switch (Op) {
339 case ISD::LOAD:
340 case ISD::STORE:
342 case ISD::BITCAST:
343 case ISD::UNDEF:
347 case ISD::IS_FPCLASS:
348 break;
353 break;
354 default:
356 break;
357 }
358 }
359 }
360
362
363 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
364 // is expanded to avoid having two separate loops in case the index is a VGPR.
365
366 // Most operations are naturally 32-bit vector operations. We only support
367 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
368 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
370 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
374
376 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
377
379 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
380 }
381
382 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
384 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
388
390 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
391
393 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
394 }
395
396 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
398 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
402
404 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
405
407 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
408 }
409
410 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
412 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
416
418 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
419
421 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
422 }
423
424 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
426 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
430
432 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
433
435 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
436 }
437
439 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
441 Custom);
442
443 if (Subtarget->hasPkMovB32()) {
444 // TODO: 16-bit element vectors should be legal with even aligned elements.
445 // TODO: Can be legal with wider source types than the result with
446 // subregister extracts.
447 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
448 }
449
451 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
452 // instead lower to cndmask in SITargetLowering::LowerSELECT().
454 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
455 // alignbit.
456 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
457
458 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
459 Custom);
460
461 // Avoid stack access for these.
462 // TODO: Generalize to more vector types.
464 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
466 Custom);
467
468 // Deal with vec3 vector operations when widened to vec4.
470 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
471
472 // Deal with vec5/6/7 vector operations when widened to vec8.
474 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
478 Custom);
479
480 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
481 // and output demarshalling
482 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
483
484 // We can't return success/failure, only the old value,
485 // let LLVM add the comparison
487 Expand);
488
489 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
490
491 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
492
493 // FIXME: This should be narrowed to i32, but that only happens if i64 is
494 // illegal.
495 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
496 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
497
498 // On SI this is s_memtime and s_memrealtime on VI.
500
501 if (Subtarget->hasSMemRealTime() ||
502 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
505
506 if (Subtarget->has16BitInsts()) {
509 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
512 } else {
514 }
515
516 if (Subtarget->hasMadMacF32Insts())
518
521
522 // We only really have 32-bit BFE instructions (and 16-bit on VI).
523 //
524 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
525 // effort to match them now. We want this to be false for i64 cases when the
526 // extraction isn't restricted to the upper or lower half. Ideally we would
527 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
528 // span the midpoint are probably relatively rare, so don't worry about them
529 // for now.
531
532 // Clamp modifier on add/sub
533 if (Subtarget->hasIntClamp())
535
536 if (Subtarget->hasAddNoCarryInsts())
537 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
538 Legal);
539
542 {MVT::f32, MVT::f64}, Custom);
543
544 // These are really only legal for ieee_mode functions. We should be avoiding
545 // them for functions that don't have ieee_mode enabled, so just say they are
546 // legal.
548 {MVT::f32, MVT::f64}, Legal);
549
550 if (Subtarget->haveRoundOpsF64())
552 Legal);
553 else
555 MVT::f64, Custom);
556
558 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
559 Legal);
560 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
561
564
565 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
566 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
567
569 Custom);
571 Custom);
573 Custom);
574
575 // Custom lower these because we can't specify a rule based on an illegal
576 // source bf16.
579
580 if (Subtarget->has16BitInsts()) {
583 MVT::i16, Legal);
584
585 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
586
588 MVT::i16, Expand);
589
593 ISD::CTPOP},
594 MVT::i16, Promote);
595
597
598 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
599
601 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
603 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
604
608
610
611 // F16 - Constant Actions.
614
615 // F16 - Load/Store Actions.
617 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
619 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
620
621 // BF16 - Load/Store Actions.
623 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
625 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
626
627 // F16 - VOP1 Actions.
630 MVT::f16, Custom);
631
632 // BF16 - VOP1 Actions.
633 if (Subtarget->hasBF16TransInsts())
635
638 MVT::f16, Promote);
641 MVT::bf16, Promote);
642
643 // F16 - VOP2 Actions.
644 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
645 Expand);
649
650 // F16 - VOP3 Actions.
652 if (STI.hasMadF16())
654
655 for (MVT VT :
656 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
657 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
658 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
659 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
660 switch (Op) {
661 case ISD::LOAD:
662 case ISD::STORE:
664 case ISD::BITCAST:
665 case ISD::UNDEF:
670 case ISD::IS_FPCLASS:
671 break;
674 case ISD::FSIN:
675 case ISD::FCOS:
677 break;
678 default:
680 break;
681 }
682 }
683 }
684
685 // v_perm_b32 can handle either of these.
686 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
688
689 // XXX - Do these do anything? Vector constants turn into build_vector.
690 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
691
692 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
693 Legal);
694
696 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
698 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
699
701 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
703 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
704
705 setOperationAction(ISD::AND, MVT::v2i16, Promote);
706 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
707 setOperationAction(ISD::OR, MVT::v2i16, Promote);
708 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
709 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
710 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
711
713 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
715 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
716 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
717 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
718
720 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
722 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
724 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
725
727 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
729 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
730 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
732
734 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
736 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
737
739 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
741 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
743 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
744
745 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
747 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
748 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
749 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
750 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
751
753 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
755 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
756 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
757 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
758
759 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
760 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
761 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
762 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
763 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
764 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
765
767 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
769 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
770 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
771 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
772
774 MVT::v2i32, Expand);
776
778 MVT::v4i32, Expand);
779
781 MVT::v8i32, Expand);
782
783 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Subtarget->hasVOP3PInsts() ? Legal : Custom);
785
786 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
787 // This isn't really legal, but this avoids the legalizer unrolling it (and
788 // allows matching fneg (fabs x) patterns)
789 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
790
791 // Can do this in one BFI plus a constant materialize.
793 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
794 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
795 MVT::v32f16, MVT::v32bf16},
796 Custom);
797
800 MVT::f16, Custom);
802
805 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
806 Custom);
807
809 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
810 Expand);
811
812 for (MVT Vec16 :
813 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
814 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
817 Vec16, Custom);
819 }
820 }
821
822 if (Subtarget->hasVOP3PInsts()) {
826 MVT::v2i16, Legal);
827
830 MVT::v2f16, Legal);
831
833 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
834
836 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
837 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
838 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
839 Custom);
840
841 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
842 // Split vector operations.
847 VT, Custom);
848
849 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
850 // Split vector operations.
852 VT, Custom);
853
856 {MVT::v2f16, MVT::v4f16}, Custom);
857
858 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
859 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
860 Custom);
861
862 if (Subtarget->hasBF16PackedInsts()) {
863 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
864 // Split vector operations.
866 VT, Custom);
867 }
868
869 if (Subtarget->hasPackedFP32Ops()) {
871 MVT::v2f32, Legal);
873 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
874 Custom);
875 }
876 }
877
879
880 if (Subtarget->has16BitInsts()) {
882 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
884 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
885 } else {
886 // Legalization hack.
887 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
888
890 }
891
893 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
894 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
895 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
896 MVT::v32f16, MVT::v32bf16},
897 Custom);
898
900
901 if (Subtarget->hasVectorMulU64())
903 else if (Subtarget->hasScalarSMulU64())
905
906 if (Subtarget->hasMad64_32())
908
909 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
911
912 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
914 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
915 } else {
916 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
917 if (Subtarget->hasMinimum3Maximum3F32())
919
920 if (Subtarget->hasMinimum3Maximum3PKF16()) {
922
923 // If only the vector form is available, we need to widen to a vector.
924 if (!Subtarget->hasMinimum3Maximum3F16())
926 }
927 }
928
929 if (Subtarget->hasVOP3PInsts()) {
930 // We want to break these into v2f16 pieces, not scalarize.
932 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
933 Custom);
934 }
935
936 if (Subtarget->hasIntMinMax64())
938 Legal);
939
941 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
942 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
943 MVT::i8},
944 Custom);
945
947 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
948 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
949 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
950 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
951 Custom);
952
954 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
955 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
956 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
957 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
958 Custom);
959
965
966 // TODO: Could move this to custom lowering, could benefit from combines on
967 // extract of relevant bits.
969
971
972 if (Subtarget->hasBF16ConversionInsts()) {
973 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
975 }
976
977 if (Subtarget->hasBF16PackedInsts()) {
980 MVT::v2bf16, Legal);
981 }
982
983 if (Subtarget->hasBF16TransInsts()) {
985 }
986
987 if (Subtarget->hasCvtPkF16F32Inst()) {
989 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
990 Custom);
991 }
992
996 ISD::SUB,
998 ISD::MUL,
999 ISD::FADD,
1000 ISD::FSUB,
1001 ISD::FDIV,
1002 ISD::FMUL,
1011 ISD::FMA,
1012 ISD::SMIN,
1013 ISD::SMAX,
1014 ISD::UMIN,
1015 ISD::UMAX,
1016 ISD::SETCC,
1018 ISD::SMIN,
1019 ISD::SMAX,
1020 ISD::UMIN,
1021 ISD::UMAX,
1022 ISD::AND,
1023 ISD::OR,
1024 ISD::XOR,
1025 ISD::SHL,
1026 ISD::SRL,
1027 ISD::SRA,
1028 ISD::FSHR,
1039
1040 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1042
1043 // All memory operations. Some folding on the pointer operand is done to help
1044 // matching the constant offsets in the addressing modes.
1046 ISD::STORE,
1071
1072 // FIXME: In other contexts we pretend this is a per-function property.
1074
1076}
1077
1078const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1079
1081 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1082 return RCRegs;
1083}
1084
1085//===----------------------------------------------------------------------===//
1086// TargetLowering queries
1087//===----------------------------------------------------------------------===//
1088
1089// v_mad_mix* support a conversion from f16 to f32.
1090//
1091// There is only one special case when denormals are enabled we don't currently,
1092// where this is OK to use.
1093bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1094 EVT DestVT, EVT SrcVT) const {
1095 return DestVT.getScalarType() == MVT::f32 &&
1096 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1097 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1098 SrcVT.getScalarType() == MVT::f16) ||
1099 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1100 SrcVT.getScalarType() == MVT::bf16)) &&
1101 // TODO: This probably only requires no input flushing?
1103}
1104
1106 LLT DestTy, LLT SrcTy) const {
1107 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1108 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1109 DestTy.getScalarSizeInBits() == 32 &&
1110 SrcTy.getScalarSizeInBits() == 16 &&
1111 // TODO: This probably only requires no input flushing?
1112 denormalModeIsFlushAllF32(*MI.getMF());
1113}
1114
1116 // SI has some legal vector types, but no legal vector operations. Say no
1117 // shuffles are legal in order to prefer scalarizing some vector operations.
1118 return false;
1119}
1120
1122 CallingConv::ID CC,
1123 EVT VT) const {
1125 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1126
1127 if (VT.isVector()) {
1128 EVT ScalarVT = VT.getScalarType();
1129 unsigned Size = ScalarVT.getSizeInBits();
1130 if (Size == 16) {
1131 return Subtarget->has16BitInsts()
1132 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1133 : MVT::i32;
1134 }
1135
1136 if (Size < 16)
1137 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1138 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1139 }
1140
1141 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1142 return MVT::i32;
1143
1144 if (VT.getSizeInBits() > 32)
1145 return MVT::i32;
1146
1147 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1148}
1149
1151 CallingConv::ID CC,
1152 EVT VT) const {
1154 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155
1156 if (VT.isVector()) {
1157 unsigned NumElts = VT.getVectorNumElements();
1158 EVT ScalarVT = VT.getScalarType();
1159 unsigned Size = ScalarVT.getSizeInBits();
1160
1161 // FIXME: Should probably promote 8-bit vectors to i16.
1162 if (Size == 16)
1163 return (NumElts + 1) / 2;
1164
1165 if (Size <= 32)
1166 return NumElts;
1167
1168 if (Size > 32)
1169 return NumElts * ((Size + 31) / 32);
1170 } else if (VT.getSizeInBits() > 32)
1171 return (VT.getSizeInBits() + 31) / 32;
1172
1173 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1174}
1175
1177 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1178 unsigned &NumIntermediates, MVT &RegisterVT) const {
1179 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1180 unsigned NumElts = VT.getVectorNumElements();
1181 EVT ScalarVT = VT.getScalarType();
1182 unsigned Size = ScalarVT.getSizeInBits();
1183 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1184 // support, but unless we can properly handle 3-vectors, it will be still be
1185 // inconsistent.
1186 if (Size == 16) {
1187 MVT SimpleIntermediateVT =
1189 IntermediateVT = SimpleIntermediateVT;
1190 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1191 NumIntermediates = (NumElts + 1) / 2;
1192 return (NumElts + 1) / 2;
1193 }
1194
1195 if (Size == 32) {
1196 RegisterVT = ScalarVT.getSimpleVT();
1197 IntermediateVT = RegisterVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size < 16 && Subtarget->has16BitInsts()) {
1203 // FIXME: Should probably form v2i16 pieces
1204 RegisterVT = MVT::i16;
1205 IntermediateVT = ScalarVT;
1206 NumIntermediates = NumElts;
1207 return NumIntermediates;
1208 }
1209
1210 if (Size != 16 && Size <= 32) {
1211 RegisterVT = MVT::i32;
1212 IntermediateVT = ScalarVT;
1213 NumIntermediates = NumElts;
1214 return NumIntermediates;
1215 }
1216
1217 if (Size > 32) {
1218 RegisterVT = MVT::i32;
1219 IntermediateVT = RegisterVT;
1220 NumIntermediates = NumElts * ((Size + 31) / 32);
1221 return NumIntermediates;
1222 }
1223 }
1224
1226 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1227}
1228
// Compute the memory EVT for the data portion of a load intrinsic, clamping
// a fixed vector's element count to \p MaxNumLanes (e.g. the number of dmask
// lanes actually loaded). Scalars pass through unchanged.
// NOTE(review): the signature's first line is outside this extract;
// presumably "static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,".
1230 const DataLayout &DL, Type *Ty,
1231 unsigned MaxNumLanes) {
1232 assert(MaxNumLanes != 0);
1233
1234 LLVMContext &Ctx = Ty->getContext();
1235 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
// Never report more lanes than the IR type actually has.
1236 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1237 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1238 NumElts);
1239 }
1240
1241 return TLI.getValueType(DL, Ty);
1242}
1243
1244// Peek through TFE struct returns to only use the data size.
// TFE-enabled image/buffer loads return {data, i32 status}; only the data
// member contributes to the memory VT.
// NOTE(review): the signature's first line is outside this extract;
// presumably "static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,".
1246 const DataLayout &DL, Type *Ty,
1247 unsigned MaxNumLanes) {
1248 auto *ST = dyn_cast<StructType>(Ty);
1249 if (!ST)
1250 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1251
1252 // TFE intrinsics return an aggregate type.
1253 assert(ST->getNumContainedTypes() == 2 &&
1254 ST->getContainedType(1)->isIntegerTy(32));
1255 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1256}
1257
1258/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1259/// in-memory representation. This return value is a custom type because there
1260/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1261/// could cause issues during codegen, these address space 7 pointers will be
1262/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1263/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1264/// for cost modeling, to work. (This also sets us up decently for doing the
1265/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
// NOTE(review): the function's signature line (1266), the condition head for
// the strided-pointer case (1269), and the fall-through return (1272) are
// outside this extract — do not assume the default return value from here.
1267 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1268 return MVT::amdgpuBufferFatPointer;
1270 DL.getPointerSizeInBits(AS) == 192)
1271 return MVT::amdgpuBufferStridedPointer;
1273}
1274/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1275/// v8i32 when padding is added.
1276/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1277/// also v8i32 with padding.
// NOTE(review): the signature line (1278) and the final fall-through return
// (1284) are outside this extract.
1279 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1280 DL.getPointerSizeInBits(AS) == 160) ||
1282 DL.getPointerSizeInBits(AS) == 192))
1283 return MVT::v8i32;
1285}
1286
1287static unsigned getIntrMemWidth(unsigned IntrID) {
1288 switch (IntrID) {
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1292 return 8;
1293 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1294 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1295 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1296 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1297 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1298 case Intrinsic::amdgcn_flat_load_monitor_b32:
1299 case Intrinsic::amdgcn_global_load_monitor_b32:
1300 return 32;
1301 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1302 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1303 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1304 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1305 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1306 case Intrinsic::amdgcn_flat_load_monitor_b64:
1307 case Intrinsic::amdgcn_global_load_monitor_b64:
1308 return 64;
1309 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1310 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1311 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1312 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1313 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1314 case Intrinsic::amdgcn_flat_load_monitor_b128:
1315 case Intrinsic::amdgcn_global_load_monitor_b128:
1316 return 128;
1317 default:
1318 llvm_unreachable("Unknown width");
1319 }
1320}
1321
// Translate a C-ABI atomic-ordering constant passed as immediate argument
// \p ArgIdx of \p CI into the corresponding internal ordering value.
// NOTE(review): the signature's first line (1322) and the case labels /
// return expressions of the switch (1327-1328, 1330-1331, 1333-1334, 1337)
// are outside this extract — the mapping itself cannot be read here; confirm
// against the full file.
1323 unsigned ArgIdx) {
1324 Value *OrderingArg = CI.getArgOperand(ArgIdx);
1325 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1326 switch (AtomicOrderingCABI(Ord)) {
1329 break;
1332 break;
1335 break;
1336 default:
1338 }
1339}
1340
1341static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1342 MDNode *ScopeMD = cast<MDNode>(
1343 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1344 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1345 return CI.getContext().getOrInsertSyncScopeID(Scope);
1346}
1347
// Describe the memory behavior of a target intrinsic call so the DAG builder
// can attach correct MachineMemOperands. Some intrinsics (buffer-to-LDS
// copies, async loads/stores) push TWO entries: one for the load side and
// one for the store side.
// NOTE(review): this extract elides several original lines (1348 signature
// head, 1352/1354/1356 flag initializations, 1360/1362 rsrc-intrinsic
// lookup, and various continuation lines such as 1362, 1371, 1377, 1382,
// 1414, 1437, 1489, 1524, 1546-1547, 1553, 1564, 1570, 1584-1585, 1635,
// 1648, 1665, 1728, 1757, 1765) — do not infer behavior for those spots
// from here.
1349 const CallBase &CI,
1350 MachineFunction &MF,
1351 unsigned IntrID) const {
// Propagate IR-level metadata (invariant/nontemporal) into MMO flags.
1353 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1355 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1357 Flags |= getTargetMMOFlags(CI);
1358
// Buffer/image ("rsrc") intrinsics get a table-driven treatment.
1359 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1361 AttributeSet Attr =
1363 MemoryEffects ME = Attr.getMemoryEffects();
1364 if (ME.doesNotAccessMemory())
1365 return;
1366
// The last argument is the cache-policy immediate, except for
// s_buffer_prefetch_data which has none.
1367 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1368 if (!IsSPrefetch) {
1369 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1370 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1372 }
1374
1375 IntrinsicInfo Info;
1376 // TODO: Should images get their own address space?
1378
1379 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1380 if (RsrcIntr->IsImage) {
1381 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1383 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1384 Info.align.reset();
1385 }
1386
1387 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1388 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1389 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1390 // We conservatively set the memory operand of a buffer intrinsic to the
1391 // base resource pointer, so that we can access alias information about
1392 // those pointers. Cases like "this points at the same value
1393 // but with a different offset" are handled in
1394 // areMemAccessesTriviallyDisjoint.
1395 Info.ptrVal = RsrcArg;
1396 }
1397
// Pure loads: size the memory VT from the dmask (images) or the full
// return type (buffers).
1398 if (ME.onlyReadsMemory()) {
1399 if (RsrcIntr->IsImage) {
1400 unsigned MaxNumLanes = 4;
1401
1402 if (!BaseOpcode->Gather4) {
1403 // If this isn't a gather, we may have excess loaded elements in the
1404 // IR type. Check the dmask for the real number of elements loaded.
1405 unsigned DMask =
1406 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1407 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1408 }
1409
1410 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1411 CI.getType(), MaxNumLanes);
1412 } else {
1413 Info.memVT =
1415 std::numeric_limits<unsigned>::max());
1416 }
1417
1418 // FIXME: What does alignment mean for an image?
1419 Info.opc = ISD::INTRINSIC_W_CHAIN;
1420 Info.flags = Flags | MachineMemOperand::MOLoad;
// Pure stores: size the memory VT from the stored data operand.
1421 } else if (ME.onlyWritesMemory()) {
1422 Info.opc = ISD::INTRINSIC_VOID;
1423
1424 Type *DataTy = CI.getArgOperand(0)->getType();
1425 if (RsrcIntr->IsImage) {
1426 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1427 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1428 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1429 DMaskLanes);
1430 } else
1431 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1432
1433 Info.flags = Flags | MachineMemOperand::MOStore;
1434 } else {
1435 // Atomic, NoReturn Sampler or prefetch
1436 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1438
1439 switch (IntrID) {
1440 default:
1441 Info.flags = Flags | MachineMemOperand::MOLoad;
1442 if (!IsSPrefetch)
1443 Info.flags |= MachineMemOperand::MOStore;
1444
1445 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1446 // Fake memory access type for no return sampler intrinsics
1447 Info.memVT = MVT::i32;
1448 } else {
1449 // XXX - Should this be volatile without known ordering?
1450 Info.flags |= MachineMemOperand::MOVolatile;
1451 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1452 }
1453 break;
1454 case Intrinsic::amdgcn_raw_buffer_load_lds:
1455 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1456 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1457 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1458 case Intrinsic::amdgcn_struct_buffer_load_lds:
1459 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1460 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1461 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1462 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1463
1464 // Entry 0: Load from buffer.
1465 // Don't set an offset, since the pointer value always represents the
1466 // base of the buffer.
1467 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1468 Info.flags = Flags | MachineMemOperand::MOLoad;
1469 Infos.push_back(Info);
1470
1471 // Entry 1: Store to LDS.
1472 // Instruction offset is applied, and an additional per-lane offset
1473 // which we simulate using a larger memory type.
1474 Info.memVT = EVT::getIntegerVT(
1475 CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
1476 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1477 Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
1478 ->getZExtValue();
1479 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1480 Info.flags = Flags | MachineMemOperand::MOStore;
1481 Infos.push_back(Info);
1482 return;
1483 }
1484 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1485 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1486 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1487 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1488 Info.memVT =
1490 std::numeric_limits<unsigned>::max());
1491 Info.flags = Flags | MachineMemOperand::MOLoad;
1492 Infos.push_back(Info);
1493 return;
1494 }
1495 }
1496 }
1497 Infos.push_back(Info);
1498 return;
1499 }
1500
// Non-rsrc intrinsics: per-intrinsic handling below.
1501 IntrinsicInfo Info;
1502 switch (IntrID) {
1503 case Intrinsic::amdgcn_ds_ordered_add:
1504 case Intrinsic::amdgcn_ds_ordered_swap: {
1505 Info.opc = ISD::INTRINSIC_W_CHAIN;
1506 Info.memVT = MVT::getVT(CI.getType());
1507 Info.ptrVal = CI.getOperand(0);
1508 Info.align.reset();
1510
// Operand 4 is the "volatile" immediate for ds_ordered_* ops.
1511 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1512 if (!Vol->isZero())
1513 Info.flags |= MachineMemOperand::MOVolatile;
1514
1515 Infos.push_back(Info);
1516 return;
1517 }
1518 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1519 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1520 Info.opc = ISD::INTRINSIC_W_CHAIN;
1521 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
// No IR pointer exists for the GS register; identify the access by the
// dedicated streamout-register address space instead.
1522 Info.ptrVal = nullptr;
1523 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1525 Infos.push_back(Info);
1526 return;
1527 }
1528 case Intrinsic::amdgcn_ds_append:
1529 case Intrinsic::amdgcn_ds_consume: {
1530 Info.opc = ISD::INTRINSIC_W_CHAIN;
1531 Info.memVT = MVT::getVT(CI.getType());
1532 Info.ptrVal = CI.getOperand(0);
1533 Info.align.reset();
1535
// Operand 1 is the "volatile" immediate for ds_append/consume.
1536 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1537 if (!Vol->isZero())
1538 Info.flags |= MachineMemOperand::MOVolatile;
1539
1540 Infos.push_back(Info);
1541 return;
1542 }
1543 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1544 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1545 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1548 Info.memVT = MVT::getVT(CI.getType());
1549 Info.ptrVal = CI.getOperand(0);
// memVT is overwritten with a fixed i64/8-byte access.
1550 Info.memVT = MVT::i64;
1551 Info.size = 8;
1552 Info.align.reset();
1554 Infos.push_back(Info);
1555 return;
1556 }
1557 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1558 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1559 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1560 Info.opc = ISD::INTRINSIC_W_CHAIN;
1561 Info.memVT =
1562 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1563 ? CI.getType()
1565 ->getElementType(0)); // XXX: what is correct VT?
1566
1567 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1568 Info.align.reset();
1569 Info.flags = Flags | MachineMemOperand::MOLoad |
1571 Infos.push_back(Info);
1572 return;
1573 }
1574 case Intrinsic::amdgcn_global_atomic_fmin_num:
1575 case Intrinsic::amdgcn_global_atomic_fmax_num:
1576 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1577 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1578 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1579 Info.opc = ISD::INTRINSIC_W_CHAIN;
1580 Info.memVT = MVT::getVT(CI.getType());
1581 Info.ptrVal = CI.getOperand(0);
1582 Info.align.reset();
1583 Info.flags =
1586 Infos.push_back(Info);
1587 return;
1588 }
1589 case Intrinsic::amdgcn_cluster_load_b32:
1590 case Intrinsic::amdgcn_cluster_load_b64:
1591 case Intrinsic::amdgcn_cluster_load_b128:
1592 case Intrinsic::amdgcn_ds_load_tr6_b96:
1593 case Intrinsic::amdgcn_ds_load_tr4_b64:
1594 case Intrinsic::amdgcn_ds_load_tr8_b64:
1595 case Intrinsic::amdgcn_ds_load_tr16_b128:
1596 case Intrinsic::amdgcn_global_load_tr6_b96:
1597 case Intrinsic::amdgcn_global_load_tr4_b64:
1598 case Intrinsic::amdgcn_global_load_tr_b64:
1599 case Intrinsic::amdgcn_global_load_tr_b128:
1600 case Intrinsic::amdgcn_ds_read_tr4_b64:
1601 case Intrinsic::amdgcn_ds_read_tr6_b96:
1602 case Intrinsic::amdgcn_ds_read_tr8_b64:
1603 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1604 Info.opc = ISD::INTRINSIC_W_CHAIN;
1605 Info.memVT = MVT::getVT(CI.getType());
1606 Info.ptrVal = CI.getOperand(0);
1607 Info.align.reset();
1608 Info.flags = Flags | MachineMemOperand::MOLoad;
1609 Infos.push_back(Info);
1610 return;
1611 }
1612 case Intrinsic::amdgcn_flat_load_monitor_b32:
1613 case Intrinsic::amdgcn_flat_load_monitor_b64:
1614 case Intrinsic::amdgcn_flat_load_monitor_b128:
1615 case Intrinsic::amdgcn_global_load_monitor_b32:
1616 case Intrinsic::amdgcn_global_load_monitor_b64:
1617 case Intrinsic::amdgcn_global_load_monitor_b128: {
1618 Info.opc = ISD::INTRINSIC_W_CHAIN;
1619 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1620 Info.ptrVal = CI.getOperand(0);
1621 Info.align.reset();
1622 Info.flags = MachineMemOperand::MOLoad;
// Args 1 and 2 carry the C-ABI ordering and the syncscope metadata.
1623 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1624 Info.ssid = parseSyncscopeMDArg(CI, 2);
1625 Infos.push_back(Info);
1626 return;
1627 }
1628 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1629 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1630 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1631 Info.opc = ISD::INTRINSIC_W_CHAIN;
1632 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1633 Info.ptrVal = CI.getOperand(0);
1634 Info.align.reset();
1636 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1637 Info.ssid = parseSyncscopeMDArg(CI, 2);
1638 Infos.push_back(Info);
1639 return;
1640 }
1641 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1642 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1643 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1644 Info.opc = ISD::INTRINSIC_VOID;
1645 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1646 Info.ptrVal = CI.getArgOperand(0);
1647 Info.align.reset();
// For stores, ordering/scope arguments shift by one (arg 1 is the data).
1649 Info.order = parseAtomicOrderingCABIArg(CI, 2);
1650 Info.ssid = parseSyncscopeMDArg(CI, 3);
1651 Infos.push_back(Info);
1652 return;
1653 }
1654 case Intrinsic::amdgcn_ds_gws_init:
1655 case Intrinsic::amdgcn_ds_gws_barrier:
1656 case Intrinsic::amdgcn_ds_gws_sema_v:
1657 case Intrinsic::amdgcn_ds_gws_sema_br:
1658 case Intrinsic::amdgcn_ds_gws_sema_p:
1659 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1660 Info.opc = ISD::INTRINSIC_VOID;
1661
1662 const GCNTargetMachine &TM =
1663 static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
// GWS ops have no IR pointer; use the machine function's GWS pseudo
// source value so aliasing stays conservative.
1666 Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668 // This is an abstract access, but we need to specify a type and size.
1669 Info.memVT = MVT::i32;
1670 Info.size = 4;
1671 Info.align = Align(4);
1672
1673 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1674 Info.flags = Flags | MachineMemOperand::MOLoad;
1675 else
1676 Info.flags = Flags | MachineMemOperand::MOStore;
1677 Infos.push_back(Info);
1678 return;
1679 }
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1681 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1682 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1683 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1685 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1686 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1687 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1688 // Entry 0: Load from source (global/flat).
1689 Info.opc = ISD::INTRINSIC_VOID;
1690 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1691 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1692 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1693 Info.flags = Flags | MachineMemOperand::MOLoad;
1694 Infos.push_back(Info);
1695
1696 // Entry 1: Store to LDS (same offset).
1697 Info.flags = Flags | MachineMemOperand::MOStore;
1698 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1699 Infos.push_back(Info);
1700 return;
1701 }
1702 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1703 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1704 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1705 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1706 // Entry 0: Load from LDS.
1707 Info.opc = ISD::INTRINSIC_VOID;
1708 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1709 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1710 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1711 Info.flags = Flags | MachineMemOperand::MOLoad;
1712 Infos.push_back(Info);
1713
1714 // Entry 1: Store to global (same offset).
1715 Info.flags = Flags | MachineMemOperand::MOStore;
1716 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1717 Infos.push_back(Info);
1718 return;
1719 }
1720 case Intrinsic::amdgcn_load_to_lds:
1721 case Intrinsic::amdgcn_load_async_to_lds:
1722 case Intrinsic::amdgcn_global_load_lds:
1723 case Intrinsic::amdgcn_global_load_async_lds: {
1724 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1725 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1726 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1727 if (IsVolatile)
1729
1730 // Entry 0: Load from source (global/flat).
1731 Info.opc = ISD::INTRINSIC_VOID;
1732 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1733 Info.ptrVal = CI.getArgOperand(0); // Source pointer
1734 Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
1735 Info.flags = Flags | MachineMemOperand::MOLoad;
1736 Infos.push_back(Info);
1737
1738 // Entry 1: Store to LDS.
1739 // Same offset from the instruction, but an additional per-lane offset is
1740 // added. Represent that using a wider memory type.
1741 Info.memVT = EVT::getIntegerVT(CI.getContext(),
1742 Width * 8 * Subtarget->getWavefrontSize());
1743 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1744 Info.flags = Flags | MachineMemOperand::MOStore;
1745 Infos.push_back(Info);
1746 return;
1747 }
1748 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1749 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1750 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1751 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1752 Info.opc = ISD::INTRINSIC_W_CHAIN;
1753
1754 const GCNTargetMachine &TM =
1755 static_cast<const GCNTargetMachine &>(getTargetMachine());
1756
1758 Info.ptrVal = MFI->getGWSPSV(TM);
1759
1760 // This is an abstract access, but we need to specify a type and size.
1761 Info.memVT = MVT::i32;
1762 Info.size = 4;
1763 Info.align = Align(4);
1764
1766 Infos.push_back(Info);
1767 return;
1768 }
1769 case Intrinsic::amdgcn_s_prefetch_data:
1770 case Intrinsic::amdgcn_flat_prefetch:
1771 case Intrinsic::amdgcn_global_prefetch: {
// Prefetches are modeled as a minimal 1-byte load at the target address.
1772 Info.opc = ISD::INTRINSIC_VOID;
1773 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1774 Info.ptrVal = CI.getArgOperand(0);
1775 Info.flags = Flags | MachineMemOperand::MOLoad;
1776 Infos.push_back(Info);
1777 return;
1778 }
1779 default:
1780 return;
1781 }
1782}
1783
// Append extra target-specific SDNode operands for certain intrinsics during
// DAG construction.
// NOTE(review): the signature head (1784) and the switch statement line
// (1786, presumably "switch (Intrinsic ID)") are outside this extract.
1785 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1787 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1788 // The DAG's ValueType loses the addrspaces.
1789 // Add them as 2 extra Constant operands "from" and "to".
1790 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1791 unsigned DstAS = I.getType()->getPointerAddressSpace();
1792 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1793 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1794 break;
1795 }
1796 default:
1797 break;
1798 }
1799}
1800
// For memory intrinsics, report the pointer operand (and its access type) so
// LSR and friends can reason about the addressing. Returns false for
// intrinsics with no address operand to expose.
// NOTE(review): the signature's leading lines (1801-1802) are outside this
// extract; presumably SITargetLowering::getAddrModeArguments(IntrinsicInst*,
// SmallVectorImpl<Value*> &Ops, Type *&AccessTy).
1803 Value *Ptr = nullptr;
1804 switch (II->getIntrinsicID()) {
// These take the memory pointer as argument 0.
1806 case Intrinsic::amdgcn_cluster_load_b128:
1807 case Intrinsic::amdgcn_cluster_load_b64:
1808 case Intrinsic::amdgcn_cluster_load_b32:
1809 case Intrinsic::amdgcn_ds_append:
1810 case Intrinsic::amdgcn_ds_consume:
1811 case Intrinsic::amdgcn_ds_load_tr8_b64:
1812 case Intrinsic::amdgcn_ds_load_tr16_b128:
1813 case Intrinsic::amdgcn_ds_load_tr4_b64:
1814 case Intrinsic::amdgcn_ds_load_tr6_b96:
1815 case Intrinsic::amdgcn_ds_read_tr4_b64:
1816 case Intrinsic::amdgcn_ds_read_tr6_b96:
1817 case Intrinsic::amdgcn_ds_read_tr8_b64:
1818 case Intrinsic::amdgcn_ds_read_tr16_b64:
1819 case Intrinsic::amdgcn_ds_ordered_add:
1820 case Intrinsic::amdgcn_ds_ordered_swap:
1821 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1822 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1823 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1824 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1825 case Intrinsic::amdgcn_global_atomic_fmax_num:
1826 case Intrinsic::amdgcn_global_atomic_fmin_num:
1827 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1828 case Intrinsic::amdgcn_global_load_tr_b64:
1829 case Intrinsic::amdgcn_global_load_tr_b128:
1830 case Intrinsic::amdgcn_global_load_tr4_b64:
1831 case Intrinsic::amdgcn_global_load_tr6_b96:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1833 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1834 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1835 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1836 Ptr = II->getArgOperand(0);
1837 break;
// These take the LDS destination pointer as argument 1.
1838 case Intrinsic::amdgcn_load_to_lds:
1839 case Intrinsic::amdgcn_load_async_to_lds:
1840 case Intrinsic::amdgcn_global_load_lds:
1841 case Intrinsic::amdgcn_global_load_async_lds:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1843 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1844 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1845 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1847 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1848 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1849 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1850 Ptr = II->getArgOperand(1);
1851 break;
1852 default:
1853 return false;
1854 }
1855 AccessTy = II->getType();
1856 Ops.push_back(Ptr);
1857 return true;
1858}
1859
// Whether the given addressing mode is legal for FLAT-family instructions in
// \p AddrSpace: no scaled index ever, and an immediate offset only if the
// subtarget/flat-variant encoding supports it.
// NOTE(review): the signature head (1860) and the initializer of FlatVariant
// (1868-1871, presumably selecting FLAT/Global/Scratch by address space) are
// outside this extract.
1862 if (!Subtarget->hasFlatInstOffsets()) {
1863 // Flat instructions do not have offsets, and only have the register
1864 // address.
1865 return AM.BaseOffs == 0 && AM.Scale == 0;
1866 }
1867
1868 decltype(SIInstrFlags::FLAT) FlatVariant =
1872
1873 return AM.Scale == 0 &&
1874 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1875 AM.BaseOffs, AddrSpace, FlatVariant));
1876}
1877
// Whether the addressing mode is legal for global-memory accesses: defer to
// the flat rules when global flat instructions exist, otherwise fall back to
// MUBUF constraints.
// NOTE(review): the signature line (1878), the return expression of the
// first branch (1880), and the return in the no-addr64 branch (1892) are
// outside this extract.
1879 if (Subtarget->hasFlatGlobalInsts())
1881
1882 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1883 // Assume that we will use FLAT for all global memory accesses
1884 // on VI.
1885 // FIXME: This assumption is currently wrong. On VI we still use
1886 // MUBUF instructions for the r + i addressing mode. As currently
1887 // implemented, the MUBUF instructions only work on buffer < 4GB.
1888 // It may be possible to support > 4GB buffers with MUBUF instructions,
1889 // by setting the stride value in the resource descriptor which would
1890 // increase the size limit to (stride * 4GB). However, this is risky,
1891 // because it has never been validated.
1893 }
1894
1895 return isLegalMUBUFAddressingMode(AM);
1896}
1897
1898bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1899 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1900 // additionally can do r + r + i with addr64. 32-bit has more addressing
1901 // mode options. Depending on the resource constant, it can also do
1902 // (i64 r0) + (i32 r1) * (i14 i).
1903 //
1904 // Private arrays end up using a scratch buffer most of the time, so also
1905 // assume those use MUBUF instructions. Scratch loads / stores are currently
1906 // implemented as mubuf instructions with offen bit set, so slightly
1907 // different than the normal addr64.
1908 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1909 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1910 return false;
1911
1912 // FIXME: Since we can split immediate into soffset and immediate offset,
1913 // would it make sense to allow any immediate?
1914
1915 switch (AM.Scale) {
1916 case 0: // r + i or just i, depending on HasBaseReg.
1917 return true;
1918 case 1:
1919 return true; // We have r + r or r + i.
1920 case 2:
1921 if (AM.HasBaseReg) {
1922 // Reject 2 * r + r.
1923 return false;
1924 }
1925
1926 // Allow 2 * r as r + r
1927 // Or 2 * r + i is allowed as r + r + i.
1928 return true;
1929 default: // Don't allow n * r
1930 return false;
1931 }
1932}
1933
// Target hook: decide whether the addressing mode (BaseGV + BaseReg +
// Scale*IndexReg + BaseOffs) is natively supported for an access of type
// \p Ty in address space \p AS.
// NOTE(review): the signature head (1934) and several interior lines
// (1946-1948 constant-address-space condition tail, 1989 buffer condition,
// 2009 flat-scratch branch head, 2030/2036 unknown-address-space case) are
// outside this extract.
1935 const AddrMode &AM, Type *Ty,
1936 unsigned AS,
1937 Instruction *I) const {
1938 // No global is ever allowed as a base.
1939 if (AM.BaseGV)
1940 return false;
1941
1942 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1943 return isLegalGlobalAddressingMode(AM);
1944
// Constant (scalar/SMRD) address spaces: offsets are encoded in dwords with
// generation-specific ranges.
1945 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1949 // If the offset isn't a multiple of 4, it probably isn't going to be
1950 // correctly aligned.
1951 // FIXME: Can we get the real alignment here?
1952 if (AM.BaseOffs % 4 != 0)
1953 return isLegalMUBUFAddressingMode(AM);
1954
1955 if (!Subtarget->hasScalarSubwordLoads()) {
1956 // There are no SMRD extloads, so if we have to do a small type access we
1957 // will use a MUBUF load.
1958 // FIXME?: We also need to do this if unaligned, but we don't know the
1959 // alignment here.
1960 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1961 return isLegalGlobalAddressingMode(AM);
1962 }
1963
1964 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1965 // SMRD instructions have an 8-bit, dword offset on SI.
1966 if (!isUInt<8>(AM.BaseOffs / 4))
1967 return false;
1968 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1969 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1970 // in 8-bits, it can use a smaller encoding.
1971 if (!isUInt<32>(AM.BaseOffs / 4))
1972 return false;
1973 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1974 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1975 if (!isUInt<20>(AM.BaseOffs))
1976 return false;
1977 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1978 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1979 // for S_BUFFER_* instructions).
1980 if (!isInt<21>(AM.BaseOffs))
1981 return false;
1982 } else {
1983 // On GFX12, all offsets are signed 24-bit in bytes.
1984 if (!isInt<24>(AM.BaseOffs))
1985 return false;
1986 }
1987
1988 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1990 AM.BaseOffs < 0) {
1991 // Scalar (non-buffer) loads can only use a negative offset if
1992 // soffset+offset is non-negative. Since the compiler can only prove that
1993 // in a few special cases, it is safer to claim that negative offsets are
1994 // not supported.
1995 return false;
1996 }
1997
1998 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1999 return true;
2000
2001 if (AM.Scale == 1 && AM.HasBaseReg)
2002 return true;
2003
2004 return false;
2005 }
2006
2007 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2008 return Subtarget->hasFlatScratchEnabled()
2010 : isLegalMUBUFAddressingMode(AM);
2011
// LDS / GDS: 16-bit unsigned byte offset, optional single base register.
2012 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2013 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2014 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2015 // field.
2016 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2017 // an 8-bit dword offset but we don't know the alignment here.
2018 if (!isUInt<16>(AM.BaseOffs))
2019 return false;
2020
2021 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2022 return true;
2023
2024 if (AM.Scale == 1 && AM.HasBaseReg)
2025 return true;
2026
2027 return false;
2028 }
2029
2031 // For an unknown address space, this usually means that this is for some
2032 // reason being used for pure arithmetic, and not based on some addressing
2033 // computation. We don't have instructions that compute pointers with any
2034 // addressing modes, so treat them as having no offset like flat
2035 // instructions.
2037 }
2038
2039 // Assume a user alias of global for unknown address spaces.
2040 return isLegalGlobalAddressingMode(AM);
2041}
2042
// Per-address-space size predicate (limits MemVT width by address space).
// NOTE(review): the signature head (2043) and the conditions guarding the
// first and third returns (2045, 2051) are outside this extract — presumably
// LDS/region gets 4 dwords and the 2-dword limit applies to another space;
// confirm against the full file.
2044 const MachineFunction &MF) const {
2046 return (MemVT.getSizeInBits() <= 4 * 32);
2047 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
// Private is capped by the subtarget's maximum private element size.
2048 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2049 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2050 }
2052 return (MemVT.getSizeInBits() <= 2 * 32);
2053 return true;
2054}
2055
2057 unsigned Size, unsigned AddrSpace, Align Alignment,
2058 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2059 if (IsFast)
2060 *IsFast = 0;
2061
2062 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2063 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2064 // Check if alignment requirements for ds_read/write instructions are
2065 // disabled.
2066 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2067 return false;
2068
2069 Align RequiredAlignment(
2070 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2071 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2072 Alignment < RequiredAlignment)
2073 return false;
2074
2075 // Either, the alignment requirements are "enabled", or there is an
2076 // unaligned LDS access related hardware bug though alignment requirements
2077 // are "disabled". In either case, we need to check for proper alignment
2078 // requirements.
2079 //
2080 switch (Size) {
2081 case 64:
2082 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2083 // address is negative, then the instruction is incorrectly treated as
2084 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2085 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2086 // load later in the SILoadStoreOptimizer.
2087 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2088 return false;
2089
2090 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2091 // can do a 4 byte aligned, 8 byte access in a single operation using
2092 // ds_read2/write2_b32 with adjacent offsets.
2093 RequiredAlignment = Align(4);
2094
2095 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2096 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2097 // ds_write2_b32 depending on the alignment. In either case with either
2098 // alignment there is no faster way of doing this.
2099
2100 // The numbers returned here and below are not additive, it is a 'speed
2101 // rank'. They are just meant to be compared to decide if a certain way
2102 // of lowering an operation is faster than another. For that purpose
2103 // naturally aligned operation gets it bitsize to indicate that "it
2104 // operates with a speed comparable to N-bit wide load". With the full
2105 // alignment ds128 is slower than ds96 for example. If underaligned it
2106 // is comparable to a speed of a single dword access, which would then
2107 // mean 32 < 128 and it is faster to issue a wide load regardless.
2108 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2109 // wider load which will not be aligned anymore the latter is slower.
2110 if (IsFast)
2111 *IsFast = (Alignment >= RequiredAlignment) ? 64
2112 : (Alignment < Align(4)) ? 32
2113 : 1;
2114 return true;
2115 }
2116
2117 break;
2118 case 96:
2119 if (!Subtarget->hasDS96AndDS128())
2120 return false;
2121
2122 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2123 // gfx8 and older.
2124
2125 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2126 // Naturally aligned access is fastest. However, also report it is Fast
2127 // if memory is aligned less than DWORD. A narrow load or store will be
2128 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2129 // be more of them, so overall we will pay less penalty issuing a single
2130 // instruction.
2131
2132 // See comment on the values above.
2133 if (IsFast)
2134 *IsFast = (Alignment >= RequiredAlignment) ? 96
2135 : (Alignment < Align(4)) ? 32
2136 : 1;
2137 return true;
2138 }
2139
2140 break;
2141 case 128:
2142 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2143 return false;
2144
2145 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2146 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2147 // single operation using ds_read2/write2_b64.
2148 RequiredAlignment = Align(8);
2149
2150 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2151 // Naturally aligned access is fastest. However, also report it is Fast
2152 // if memory is aligned less than DWORD. A narrow load or store will be
2153 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2154 // will be more of them, so overall we will pay less penalty issuing a
2155 // single instruction.
2156
2157 // See comment on the values above.
2158 if (IsFast)
2159 *IsFast = (Alignment >= RequiredAlignment) ? 128
2160 : (Alignment < Align(4)) ? 32
2161 : 1;
2162 return true;
2163 }
2164
2165 break;
2166 default:
2167 if (Size > 32)
2168 return false;
2169
2170 break;
2171 }
2172
2173 // See comment on the values above.
2174 // Note that we have a single-dword or sub-dword here, so if underaligned
2175 // it is a slowest possible access, hence returned value is 0.
2176 if (IsFast)
2177 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2178
2179 return Alignment >= RequiredAlignment ||
2180 Subtarget->hasUnalignedDSAccessEnabled();
2181 }
2182
2183 // FIXME: We have to be conservative here and assume that flat operations
2184 // will access scratch. If we had access to the IR function, then we
2185 // could determine if any private memory was used in the function.
2186 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2187 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2188 bool AlignedBy4 = Alignment >= Align(4);
2189 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2190 if (IsFast)
2191 *IsFast = AlignedBy4 ? Size : 1;
2192 return true;
2193 }
2194
2195 if (IsFast)
2196 *IsFast = AlignedBy4;
2197
2198 return AlignedBy4;
2199 }
2200
2201 // So long as they are correct, wide global memory operations perform better
2202 // than multiple smaller memory ops -- even when misaligned
2203 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2204 if (IsFast)
2205 *IsFast = Size;
2206
2207 return Alignment >= Align(4) ||
2208 Subtarget->hasUnalignedBufferAccessEnabled();
2209 }
2210
2211 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2212 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2213 // out-of-bounds behavior, but in the edge case where an access starts
2214 // out-of-bounds and then enter in-bounds, the entire access would be treated
2215 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2216 // natural alignment of buffer accesses.
2217 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2218 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2219 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2220 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2221 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2222 return false;
2223 }
2224
2225 // Smaller than dword value must be aligned.
2226 if (Size < 32)
2227 return false;
2228
2229 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2230 // byte-address are ignored, thus forcing Dword alignment.
2231 // This applies to private, global, and constant memory.
2232 if (IsFast)
2233 *IsFast = 1;
2234
2235 return Size >= 32 && Alignment >= Align(4);
2236}
2237
2239 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2240 unsigned *IsFast) const {
2242 Alignment, Flags, IsFast);
2243}
2244
2246 LLVMContext &Context, const MemOp &Op,
2247 const AttributeList &FuncAttributes) const {
2248 // FIXME: Should account for address space here.
2249
2250 // The default fallback uses the private pointer size as a guess for a type to
2251 // use. Make sure we switch these to 64-bit accesses.
2252
2253 if (Op.size() >= 16 &&
2254 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2255 return MVT::v4i32;
2256
2257 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2258 return MVT::v2i32;
2259
2260 // Use the default.
2261 return MVT::Other;
2262}
2263
2265 const MemSDNode *MemNode = cast<MemSDNode>(N);
2266 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2267}
2268
2273
2275 unsigned DestAS) const {
2276 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2277 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2278 Subtarget->hasGloballyAddressableScratch()) {
2279 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2280 return false;
2281 }
2282
2283 // Flat -> private/local is a simple truncate.
2284 // Flat -> global is no-op
2285 return true;
2286 }
2287
2288 const GCNTargetMachine &TM =
2289 static_cast<const GCNTargetMachine &>(getTargetMachine());
2290 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2291}
2292
2300
2302 Type *Ty) const {
2303 // FIXME: Could be smarter if called for vector constants.
2304 return true;
2305}
2306
2308 unsigned Index) const {
2310 return false;
2311
2312 // TODO: Add more cases that are cheap.
2313 return Index == 0;
2314}
2315
2316bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2317 // TODO: This should be more aggressive, particular for 16-bit element
2318 // vectors. However there are some mixed improvements and regressions.
2319 EVT EltTy = VT.getVectorElementType();
2320 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2321 return EltTy.getSizeInBits() % MinAlign == 0;
2322}
2323
2325 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2326 switch (Op) {
2327 case ISD::LOAD:
2328 case ISD::STORE:
2329 return true;
2330 default:
2331 return false;
2332 }
2333 }
2334
2335 // SimplifySetCC uses this function to determine whether or not it should
2336 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2337 if (VT == MVT::i1 && Op == ISD::SETCC)
2338 return false;
2339
2341}
2342
2345 // This isn't really a constant pool but close enough.
2348 return PtrInfo;
2349}
2350
2351SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2352 const SDLoc &SL,
2353 SDValue Chain,
2354 uint64_t Offset) const {
2355 const DataLayout &DL = DAG.getDataLayout();
2359
2360 auto [InputPtrReg, RC, ArgTy] =
2361 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2362
2363 // We may not have the kernarg segment argument if we have no kernel
2364 // arguments.
2365 if (!InputPtrReg)
2366 return DAG.getConstant(Offset, SL, PtrVT);
2367
2369 SDValue BasePtr = DAG.getCopyFromReg(
2370 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2371
2372 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2373}
2374
2375SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2376 const SDLoc &SL) const {
2379 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2380}
2381
2382SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2383 const SDLoc &SL) const {
2384
2386 std::optional<uint32_t> KnownSize =
2388 if (KnownSize.has_value())
2389 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2390 return SDValue();
2391}
2392
2393SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2394 const SDLoc &SL, SDValue Val,
2395 bool Signed,
2396 const ISD::InputArg *Arg) const {
2397 // First, if it is a widened vector, narrow it.
2398 if (VT.isVector() &&
2400 EVT NarrowedVT =
2403 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2404 DAG.getConstant(0, SL, MVT::i32));
2405 }
2406
2407 // Then convert the vector elements or scalar value.
2408 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2409 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2410 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2411 }
2412
2413 if (MemVT.isFloatingPoint()) {
2414 if (VT.isFloatingPoint()) {
2415 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2416 } else {
2417 assert(!MemVT.isVector());
2418 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2419 SDValue Cast = DAG.getBitcast(IntVT, Val);
2420 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2421 }
2422 } else if (Signed)
2423 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2424 else
2425 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2426
2427 return Val;
2428}
2429
2430SDValue SITargetLowering::lowerKernargMemParameter(
2431 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2432 uint64_t Offset, Align Alignment, bool Signed,
2433 const ISD::InputArg *Arg) const {
2434
2435 MachinePointerInfo PtrInfo =
2437
2438 // Try to avoid using an extload by loading earlier than the argument address,
2439 // and extracting the relevant bits. The load should hopefully be merged with
2440 // the previous argument.
2441 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2442 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2443 int64_t AlignDownOffset = alignDown(Offset, 4);
2444 int64_t OffsetDiff = Offset - AlignDownOffset;
2445
2446 EVT IntVT = MemVT.changeTypeToInteger();
2447
2448 // TODO: If we passed in the base kernel offset we could have a better
2449 // alignment than 4, but we don't really need it.
2450 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2451 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2452 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2455
2456 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2457 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2458
2459 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2460 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2461 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2462
2463 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2464 }
2465
2466 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2467 SDValue Load = DAG.getLoad(
2468 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2470
2471 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2472 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2473}
2474
2475/// Coerce an argument which was passed in a different ABI type to the original
2476/// expected value type.
2477SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2478 SDValue Val,
2479 CCValAssign &VA,
2480 const SDLoc &SL) const {
2481 EVT ValVT = VA.getValVT();
2482
2483 // If this is an 8 or 16-bit value, it is really passed promoted
2484 // to 32 bits. Insert an assert[sz]ext to capture this, then
2485 // truncate to the right size.
2486 switch (VA.getLocInfo()) {
2487 case CCValAssign::Full:
2488 return Val;
2489 case CCValAssign::BCvt:
2490 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2491 case CCValAssign::SExt:
2492 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2493 DAG.getValueType(ValVT));
2494 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2495 case CCValAssign::ZExt:
2496 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2497 DAG.getValueType(ValVT));
2498 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2499 case CCValAssign::AExt:
2500 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2501 default:
2502 llvm_unreachable("Unknown loc info!");
2503 }
2504}
2505
2506SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2507 CCValAssign &VA, const SDLoc &SL,
2508 SDValue Chain,
2509 const ISD::InputArg &Arg) const {
2510 MachineFunction &MF = DAG.getMachineFunction();
2511 MachineFrameInfo &MFI = MF.getFrameInfo();
2512
2513 if (Arg.Flags.isByVal()) {
2514 unsigned Size = Arg.Flags.getByValSize();
2515 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2516 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2517 }
2518
2519 unsigned ArgOffset = VA.getLocMemOffset();
2520 unsigned ArgSize = VA.getValVT().getStoreSize();
2521
2522 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2523
2524 // Create load nodes to retrieve arguments from the stack.
2525 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2526
2527 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2529 MVT MemVT = VA.getValVT();
2530
2531 switch (VA.getLocInfo()) {
2532 default:
2533 break;
2534 case CCValAssign::BCvt:
2535 MemVT = VA.getLocVT();
2536 break;
2537 case CCValAssign::SExt:
2538 ExtType = ISD::SEXTLOAD;
2539 break;
2540 case CCValAssign::ZExt:
2541 ExtType = ISD::ZEXTLOAD;
2542 break;
2543 case CCValAssign::AExt:
2544 ExtType = ISD::EXTLOAD;
2545 break;
2546 }
2547
2548 SDValue ArgValue = DAG.getExtLoad(
2549 ExtType, SL, VA.getLocVT(), Chain, FIN,
2551
2552 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2553 if (ConvertedVal == ArgValue)
2554 return ConvertedVal;
2555
2556 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2557}
2558
2559SDValue SITargetLowering::lowerWorkGroupId(
2560 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2563 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2564 if (!Subtarget->hasClusters())
2565 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2566
2567 // Clusters are supported. Return the global position in the grid. If clusters
2568 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2569
2570 // WorkGroupIdXYZ = ClusterId == 0 ?
2571 // ClusterIdXYZ :
2572 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2573 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2574 SDLoc SL(ClusterIdXYZ);
2575 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2576 SDValue One = DAG.getConstant(1, SL, VT);
2577 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2578 SDValue ClusterWorkGroupIdXYZ =
2579 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2580 SDValue GlobalIdXYZ =
2581 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2582 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2583
2584 switch (MFI.getClusterDims().getKind()) {
2587 return GlobalIdXYZ;
2589 return ClusterIdXYZ;
2591 using namespace AMDGPU::Hwreg;
2592 SDValue ClusterIdField =
2593 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2594 SDNode *GetReg =
2595 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2596 SDValue ClusterId(GetReg, 0);
2597 SDValue Zero = DAG.getConstant(0, SL, VT);
2598 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2599 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2600 }
2601 }
2602
2603 llvm_unreachable("nothing should reach here");
2604}
2605
2606SDValue SITargetLowering::getPreloadedValue(
2607 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2609 const ArgDescriptor *Reg = nullptr;
2610 const TargetRegisterClass *RC;
2611 LLT Ty;
2612
2614 const ArgDescriptor WorkGroupIDX =
2615 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2616 // If GridZ is not programmed in an entry function then the hardware will set
2617 // it to all zeros, so there is no need to mask the GridY value in the low
2618 // order bits.
2619 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2620 AMDGPU::TTMP7,
2621 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2622 const ArgDescriptor WorkGroupIDZ =
2623 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2624 const ArgDescriptor ClusterWorkGroupIDX =
2625 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2626 const ArgDescriptor ClusterWorkGroupIDY =
2627 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2628 const ArgDescriptor ClusterWorkGroupIDZ =
2629 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2630 const ArgDescriptor ClusterWorkGroupMaxIDX =
2631 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2632 const ArgDescriptor ClusterWorkGroupMaxIDY =
2633 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2634 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2635 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2636 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2637 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2638
2639 auto LoadConstant = [&](unsigned N) {
2640 return DAG.getConstant(N, SDLoc(), VT);
2641 };
2642
2643 if (Subtarget->hasArchitectedSGPRs() &&
2645 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2646 bool HasFixedDims = ClusterDims.isFixedDims();
2647
2648 switch (PVID) {
2650 Reg = &WorkGroupIDX;
2651 RC = &AMDGPU::SReg_32RegClass;
2652 Ty = LLT::scalar(32);
2653 break;
2655 Reg = &WorkGroupIDY;
2656 RC = &AMDGPU::SReg_32RegClass;
2657 Ty = LLT::scalar(32);
2658 break;
2660 Reg = &WorkGroupIDZ;
2661 RC = &AMDGPU::SReg_32RegClass;
2662 Ty = LLT::scalar(32);
2663 break;
2665 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2666 return LoadConstant(0);
2667 Reg = &ClusterWorkGroupIDX;
2668 RC = &AMDGPU::SReg_32RegClass;
2669 Ty = LLT::scalar(32);
2670 break;
2672 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2673 return LoadConstant(0);
2674 Reg = &ClusterWorkGroupIDY;
2675 RC = &AMDGPU::SReg_32RegClass;
2676 Ty = LLT::scalar(32);
2677 break;
2679 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2680 return LoadConstant(0);
2681 Reg = &ClusterWorkGroupIDZ;
2682 RC = &AMDGPU::SReg_32RegClass;
2683 Ty = LLT::scalar(32);
2684 break;
2686 if (HasFixedDims)
2687 return LoadConstant(ClusterDims.getDims()[0] - 1);
2688 Reg = &ClusterWorkGroupMaxIDX;
2689 RC = &AMDGPU::SReg_32RegClass;
2690 Ty = LLT::scalar(32);
2691 break;
2693 if (HasFixedDims)
2694 return LoadConstant(ClusterDims.getDims()[1] - 1);
2695 Reg = &ClusterWorkGroupMaxIDY;
2696 RC = &AMDGPU::SReg_32RegClass;
2697 Ty = LLT::scalar(32);
2698 break;
2700 if (HasFixedDims)
2701 return LoadConstant(ClusterDims.getDims()[2] - 1);
2702 Reg = &ClusterWorkGroupMaxIDZ;
2703 RC = &AMDGPU::SReg_32RegClass;
2704 Ty = LLT::scalar(32);
2705 break;
2707 Reg = &ClusterWorkGroupMaxFlatID;
2708 RC = &AMDGPU::SReg_32RegClass;
2709 Ty = LLT::scalar(32);
2710 break;
2711 default:
2712 break;
2713 }
2714 }
2715
2716 if (!Reg)
2717 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2718 if (!Reg) {
2720 // It's possible for a kernarg intrinsic call to appear in a kernel with
2721 // no allocated segment, in which case we do not add the user sgpr
2722 // argument, so just return null.
2723 return DAG.getConstant(0, SDLoc(), VT);
2724 }
2725
2726 // It's undefined behavior if a function marked with the amdgpu-no-*
2727 // attributes uses the corresponding intrinsic.
2728 return DAG.getPOISON(VT);
2729 }
2730
2731 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2732}
2733
2735 CallingConv::ID CallConv,
2736 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2737 FunctionType *FType,
2738 SIMachineFunctionInfo *Info) {
2739 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2740 const ISD::InputArg *Arg = &Ins[I];
2741
2742 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2743 "vector type argument should have been split");
2744
2745 // First check if it's a PS input addr.
2746 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2747 PSInputNum <= 15) {
2748 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2749
2750 // Inconveniently only the first part of the split is marked as isSplit,
2751 // so skip to the end. We only want to increment PSInputNum once for the
2752 // entire split argument.
2753 if (Arg->Flags.isSplit()) {
2754 while (!Arg->Flags.isSplitEnd()) {
2755 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2756 "unexpected vector split in ps argument type");
2757 if (!SkipArg)
2758 Splits.push_back(*Arg);
2759 Arg = &Ins[++I];
2760 }
2761 }
2762
2763 if (SkipArg) {
2764 // We can safely skip PS inputs.
2765 Skipped.set(Arg->getOrigArgIndex());
2766 ++PSInputNum;
2767 continue;
2768 }
2769
2770 Info->markPSInputAllocated(PSInputNum);
2771 if (Arg->Used)
2772 Info->markPSInputEnabled(PSInputNum);
2773
2774 ++PSInputNum;
2775 }
2776
2777 Splits.push_back(*Arg);
2778 }
2779}
2780
2781// Allocate special inputs passed in VGPRs.
2783 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2784 SIMachineFunctionInfo &Info) const {
2785 const LLT S32 = LLT::scalar(32);
2786 MachineRegisterInfo &MRI = MF.getRegInfo();
2787
2788 if (Info.hasWorkItemIDX()) {
2789 Register Reg = AMDGPU::VGPR0;
2790 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2791
2792 CCInfo.AllocateReg(Reg);
2793 unsigned Mask =
2794 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2795 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2796 }
2797
2798 if (Info.hasWorkItemIDY()) {
2799 assert(Info.hasWorkItemIDX());
2800 if (Subtarget->hasPackedTID()) {
2801 Info.setWorkItemIDY(
2802 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2803 } else {
2804 unsigned Reg = AMDGPU::VGPR1;
2805 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2806
2807 CCInfo.AllocateReg(Reg);
2808 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2809 }
2810 }
2811
2812 if (Info.hasWorkItemIDZ()) {
2813 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2814 if (Subtarget->hasPackedTID()) {
2815 Info.setWorkItemIDZ(
2816 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2817 } else {
2818 unsigned Reg = AMDGPU::VGPR2;
2819 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2820
2821 CCInfo.AllocateReg(Reg);
2822 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2823 }
2824 }
2825}
2826
2827// Try to allocate a VGPR at the end of the argument list, or if no argument
2828// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates the bitfield position in the register.
// If \p Arg is given, use it with the new \p Mask instead of allocating new.
2831static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2832 ArgDescriptor Arg = ArgDescriptor()) {
2833 if (Arg.isSet())
2834 return ArgDescriptor::createArg(Arg, Mask);
2835
2836 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2837 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2838 if (RegIdx == ArgVGPRs.size()) {
2839 // Spill to stack required.
2840 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2841
2842 return ArgDescriptor::createStack(Offset, Mask);
2843 }
2844
2845 unsigned Reg = ArgVGPRs[RegIdx];
2846 Reg = CCInfo.AllocateReg(Reg);
2847 assert(Reg != AMDGPU::NoRegister);
2848
2849 MachineFunction &MF = CCInfo.getMachineFunction();
2850 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2851 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2852 return ArgDescriptor::createRegister(Reg, Mask);
2853}
2854
2856 const TargetRegisterClass *RC,
2857 unsigned NumArgRegs) {
2858 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2859 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2860 if (RegIdx == ArgSGPRs.size())
2861 report_fatal_error("ran out of SGPRs for arguments");
2862
2863 unsigned Reg = ArgSGPRs[RegIdx];
2864 Reg = CCInfo.AllocateReg(Reg);
2865 assert(Reg != AMDGPU::NoRegister);
2866
2867 MachineFunction &MF = CCInfo.getMachineFunction();
2868 MF.addLiveIn(Reg, RC);
2870}
2871
2872// If this has a fixed position, we still should allocate the register in the
2873// CCInfo state. Technically we could get away with this for values passed
2874// outside of the normal argument range.
2876 const TargetRegisterClass *RC,
2877 MCRegister Reg) {
2878 Reg = CCInfo.AllocateReg(Reg);
2879 assert(Reg != AMDGPU::NoRegister);
2880 MachineFunction &MF = CCInfo.getMachineFunction();
2881 MF.addLiveIn(Reg, RC);
2882}
2883
2884static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2885 if (Arg) {
2886 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2887 Arg.getRegister());
2888 } else
2889 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2890}
2891
2892static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2893 if (Arg) {
2894 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2895 Arg.getRegister());
2896 } else
2897 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2898}
2899
2900/// Allocate implicit function VGPR arguments at the end of allocated user
2901/// arguments.
2903 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2904 SIMachineFunctionInfo &Info) const {
2905 const unsigned Mask = 0x3ff;
2906 ArgDescriptor Arg;
2907
2908 if (Info.hasWorkItemIDX()) {
2909 Arg = allocateVGPR32Input(CCInfo, Mask);
2910 Info.setWorkItemIDX(Arg);
2911 }
2912
2913 if (Info.hasWorkItemIDY()) {
2914 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2915 Info.setWorkItemIDY(Arg);
2916 }
2917
2918 if (Info.hasWorkItemIDZ())
2919 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2920}
2921
2922/// Allocate implicit function VGPR arguments in fixed registers.
2924 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2925 SIMachineFunctionInfo &Info) const {
2926 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2927 if (!Reg)
2928 report_fatal_error("failed to allocate VGPR for implicit arguments");
2929
2930 const unsigned Mask = 0x3ff;
2931 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2932 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2933 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2934}
2935
2937 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2938 SIMachineFunctionInfo &Info) const {
2939 auto &ArgInfo = Info.getArgInfo();
2940 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2941
2942 // TODO: Unify handling with private memory pointers.
2943 if (UserSGPRInfo.hasDispatchPtr())
2944 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2945
2946 if (UserSGPRInfo.hasQueuePtr())
2947 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2948
2949 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2950 // constant offset from the kernarg segment.
2951 if (Info.hasImplicitArgPtr())
2952 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2953
2954 if (UserSGPRInfo.hasDispatchID())
2955 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2956
2957 // flat_scratch_init is not applicable for non-kernel functions.
2958
2959 if (Info.hasWorkGroupIDX())
2960 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2961
2962 if (Info.hasWorkGroupIDY())
2963 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2964
2965 if (Info.hasWorkGroupIDZ())
2966 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2967
2968 if (Info.hasLDSKernelId())
2969 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2970}
2971
2972// Allocate special inputs passed in user SGPRs.
2974 MachineFunction &MF,
2975 const SIRegisterInfo &TRI,
2976 SIMachineFunctionInfo &Info) const {
2977 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2978 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2979 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2980 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2981 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2982 }
2983
2984 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2985 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2986 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2987 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2988 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2989 }
2990
2991 if (UserSGPRInfo.hasDispatchPtr()) {
2992 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2993 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2994 CCInfo.AllocateReg(DispatchPtrReg);
2995 }
2996
2997 if (UserSGPRInfo.hasQueuePtr()) {
2998 Register QueuePtrReg = Info.addQueuePtr(TRI);
2999 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3000 CCInfo.AllocateReg(QueuePtrReg);
3001 }
3002
3003 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3004 MachineRegisterInfo &MRI = MF.getRegInfo();
3005 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3006 CCInfo.AllocateReg(InputPtrReg);
3007
3008 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3010 }
3011
3012 if (UserSGPRInfo.hasDispatchID()) {
3013 Register DispatchIDReg = Info.addDispatchID(TRI);
3014 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3015 CCInfo.AllocateReg(DispatchIDReg);
3016 }
3017
3018 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3019 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3020 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3021 CCInfo.AllocateReg(FlatScratchInitReg);
3022 }
3023
3024 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3025 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3026 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3027 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3028 }
3029
3030 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3031 // these from the dispatch pointer.
3032}
3033
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
3037 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3039 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3040 Function &F = MF.getFunction();
3041 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3042 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3043 bool InPreloadSequence = true;
3044 unsigned InIdx = 0;
3045 bool AlignedForImplictArgs = false;
3046 unsigned ImplicitArgOffset = 0;
3047 for (auto &Arg : F.args()) {
3048 if (!InPreloadSequence || !Arg.hasInRegAttr())
3049 break;
3050
3051 unsigned ArgIdx = Arg.getArgNo();
3052 // Don't preload non-original args or parts not in the current preload
3053 // sequence.
3054 if (InIdx < Ins.size() &&
3055 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3056 break;
3057
3058 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3059 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3060 InIdx++) {
3061 assert(ArgLocs[ArgIdx].isMemLoc());
3062 auto &ArgLoc = ArgLocs[InIdx];
3063 const Align KernelArgBaseAlign = Align(16);
3064 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3065 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
3066 unsigned NumAllocSGPRs =
3067 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3068
3069 // Fix alignment for hidden arguments.
3070 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3071 if (!AlignedForImplictArgs) {
3072 ImplicitArgOffset =
3073 alignTo(LastExplicitArgOffset,
3074 Subtarget->getAlignmentForImplicitArgPtr()) -
3075 LastExplicitArgOffset;
3076 AlignedForImplictArgs = true;
3077 }
3078 ArgOffset += ImplicitArgOffset;
3079 }
3080
3081 // Arg is preloaded into the previous SGPR.
3082 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3083 assert(InIdx >= 1 && "No previous SGPR");
3084 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3085 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3086 continue;
3087 }
3088
3089 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3090 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3091 // Check for free user SGPRs for preloading.
3092 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3093 InPreloadSequence = false;
3094 break;
3095 }
3096
3097 // Preload this argument.
3098 const TargetRegisterClass *RC =
3099 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3100 SmallVectorImpl<MCRegister> *PreloadRegs =
3101 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3102
3103 if (PreloadRegs->size() > 1)
3104 RC = &AMDGPU::SGPR_32RegClass;
3105 for (auto &Reg : *PreloadRegs) {
3106 assert(Reg);
3107 MF.addLiveIn(Reg, RC);
3108 CCInfo.AllocateReg(Reg);
3109 }
3110
3111 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3112 }
3113 }
3114}
3115
// Reserve a user SGPR for the synthetic LDS kernel id, if the function uses
// one. Must run after all real kernarg preloads (see comment below).
3117                                           const SIRegisterInfo &TRI,
3118                                           SIMachineFunctionInfo &Info) const {
3119  // Always allocate this last since it is a synthetic preload.
3120  if (Info.hasLDSKernelId()) {
3121    Register Reg = Info.addLDSKernelId();
    // Mark the SGPR live-in and reserve it in the calling-convention state.
3122    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3123    CCInfo.AllocateReg(Reg);
3124  }
3125}
3126
3127// Allocate special input registers that are initialized per-wave.
// Adds the system SGPR inputs (workgroup IDs, workgroup info, scratch wave
// byte offset) as function live-ins and reserves them with CCInfo.
3130                                           CallingConv::ID CallConv,
3131                                           bool IsShader) const {
3132  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3133  if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3134    // Note: user SGPRs are handled by the front-end for graphics shaders
3135    // Pad up the used user SGPRs with dead inputs.
3136
3137    // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3138    // before enabling architected SGPRs for workgroup IDs.
3139    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3140
3141    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3142    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3143    // rely on it to reach 16 since if we end up having no stack usage, it will
3144    // not really be added.
3145    unsigned NumRequiredSystemSGPRs =
3146        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3147        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    // Reserve dummy user SGPRs until user + system inputs total at least 16,
    // working around the hardware init bug tested above.
3148    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3149      Register Reg = Info.addReservedUserSGPR();
3150      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3151      CCInfo.AllocateReg(Reg);
3152    }
3153  }
3154
  // With architected SGPRs, workgroup IDs are not passed as per-wave SGPR
  // inputs, so nothing is allocated for them here.
3155  if (!HasArchitectedSGPRs) {
3156    if (Info.hasWorkGroupIDX()) {
3157      Register Reg = Info.addWorkGroupIDX();
3158      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3159      CCInfo.AllocateReg(Reg);
3160    }
3161
3162    if (Info.hasWorkGroupIDY()) {
3163      Register Reg = Info.addWorkGroupIDY();
3164      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3165      CCInfo.AllocateReg(Reg);
3166    }
3167
3168    if (Info.hasWorkGroupIDZ()) {
3169      Register Reg = Info.addWorkGroupIDZ();
3170      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3171      CCInfo.AllocateReg(Reg);
3172    }
3173  }
3174
3175  if (Info.hasWorkGroupInfo()) {
3176    Register Reg = Info.addWorkGroupInfo();
3177    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3178    CCInfo.AllocateReg(Reg);
3179  }
3180
3181  if (Info.hasPrivateSegmentWaveByteOffset()) {
3182    // Scratch wave offset passed in system SGPR.
3183    unsigned PrivateSegmentWaveByteOffsetReg;
3184
3185    if (IsShader) {
3186      PrivateSegmentWaveByteOffsetReg =
3187          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3188
3189      // This is true if the scratch wave byte offset doesn't have a fixed
3190      // location.
3191      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3192        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3193        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3194      }
3195    } else
3196      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3197
3198    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3199    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3200  }
3201
  // Sanity-check that the padding loop above actually reached 16 inputs.
3202  assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3203         Info.getNumPreloadedSGPRs() >= 16);
3204}
3205
// Choose the physical SGPRs that hold the scratch resource descriptor and
// the stack/frame pointers for an entry function, based on whether any
// stack access (objects, spills, or calls) is expected.
3207                                     MachineFunction &MF,
3208                                     const SIRegisterInfo &TRI,
3209                                     SIMachineFunctionInfo &Info) {
3210  // Now that we've figured out where the scratch register inputs are, see if
3211  // should reserve the arguments and use them directly.
3212  MachineFrameInfo &MFI = MF.getFrameInfo();
3213  bool HasStackObjects = MFI.hasStackObjects();
3214  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3215
3216  // Record that we know we have non-spill stack objects so we don't need to
3217  // check all stack objects later.
3218  if (HasStackObjects)
3219    Info.setHasNonSpillStackObjects(true);
3220
3221  // Everything live out of a block is spilled with fast regalloc, so it's
3222  // almost certain that spilling will be required.
3224    HasStackObjects = true;
3225
3226  // For now assume stack access is needed in any callee functions, so we need
3227  // the scratch registers to pass in.
3228  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3229
3230  if (!ST.hasFlatScratchEnabled()) {
3231    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3232      // If we have stack objects, we unquestionably need the private buffer
3233      // resource. For the Code Object V2 ABI, this will be the first 4 user
3234      // SGPR inputs. We can reserve those and use them directly.
3235
3236      Register PrivateSegmentBufferReg =
3238      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3239    } else {
3240      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3241      // We tentatively reserve the last registers (skipping the last registers
3242      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3243      // we'll replace these with the ones immediately after those which were
3244      // really allocated. In the prologue copies will be inserted from the
3245      // argument to these reserved registers.
3246
3247      // Without HSA, relocations are used for the scratch pointer and the
3248      // buffer resource setup is always inserted in the prologue. Scratch wave
3249      // offset is still in an input SGPR.
3250      Info.setScratchRSrcReg(ReservedBufferReg);
3251    }
3252  }
3253
3254  MachineRegisterInfo &MRI = MF.getRegInfo();
3255
3256  // For entry functions we have to set up the stack pointer if we use it,
3257  // whereas non-entry functions get this "for free". This means there is no
3258  // intrinsic advantage to using S32 over S34 in cases where we do not have
3259  // calls but do need a frame pointer (i.e. if we are requested to have one
3260  // because frame pointer elimination is disabled). To keep things simple we
3261  // only ever use S32 as the call ABI stack pointer, and so using it does not
3262  // imply we need a separate frame pointer.
3263  //
3264  // Try to use s32 as the SP, but move it if it would interfere with input
3265  // arguments. This won't work with calls though.
3266  //
3267  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3268  // registers.
3269  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3270    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3271  } else {
3273
3274    if (MFI.hasCalls())
3275      report_fatal_error("call in graphics shader with too many input SGPRs");
3276
    // Fall back to the first SGPR that is not already a live-in argument.
3277    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3278      if (!MRI.isLiveIn(Reg)) {
3279        Info.setStackPtrOffsetReg(Reg);
3280        break;
3281      }
3282    }
3283
    // SP_REG is the placeholder value, so it being unchanged means the scan
    // above found no free SGPR.
3284    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3285      report_fatal_error("failed to find register for SP");
3286  }
3287
3288  // hasFP should be accurate for entry functions even before the frame is
3289  // finalized, because it does not rely on the known stack size, only
3290  // properties like whether variable sized objects are present.
3291  if (ST.getFrameLowering()->hasFP(MF)) {
3292    Info.setFrameOffsetReg(AMDGPU::SGPR33);
3293  }
3294}
3295
3298 return !Info->isEntryFunction();
3299}
3300
3302
// For split-CSR functions: copy each callee-saved register into a fresh
// virtual register at function entry, and copy it back just before the
// terminator of every exit block.
3304    MachineBasicBlock *Entry,
3305    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3307
  // A null list means split-CSR does not apply to this function.
3308  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3309  if (!IStart)
3310    return;
3311
3312  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3313  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3314  MachineBasicBlock::iterator MBBI = Entry->begin();
  // The register list is null-terminated.
3315  for (const MCPhysReg *I = IStart; *I; ++I) {
3316    const TargetRegisterClass *RC = nullptr;
3317    if (AMDGPU::SReg_64RegClass.contains(*I))
3318      RC = &AMDGPU::SGPR_64RegClass;
3319    else if (AMDGPU::SReg_32RegClass.contains(*I))
3320      RC = &AMDGPU::SGPR_32RegClass;
3321    else
3322      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3323
3324    Register NewVR = MRI->createVirtualRegister(RC);
3325    // Create copy from CSR to a virtual register.
3326    Entry->addLiveIn(*I);
3327    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3328        .addReg(*I);
3329
3330    // Insert the copy-back instructions right before the terminator.
3331    for (auto *Exit : Exits)
3332      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3333              TII->get(TargetOpcode::COPY), *I)
3334          .addReg(NewVR);
3335  }
3336}
3337
// Lower the incoming formal arguments for SI: allocates the special entry
// VGPR/SGPR inputs, preloaded kernarg SGPRs, and system SGPRs, then builds
// DAG nodes that materialize each argument value (from registers, preloaded
// SGPRs, kernarg memory, or the stack) into InVals.
3339    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3340    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3341    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3343
3345  const Function &Fn = MF.getFunction();
3348  bool IsError = false;
3349
  // Graphics calling conventions are not supported on HSA; diagnose and
  // continue lowering with poison argument values (IsError path below).
3350  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3352        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3353    IsError = true;
3354  }
3355
3358  BitVector Skipped(Ins.size());
3359  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3360                 *DAG.getContext());
3361
3362  bool IsGraphics = AMDGPU::isGraphics(CallConv);
3363  bool IsKernel = AMDGPU::isKernel(CallConv);
3364  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3365
  // Sanity checks: graphics shaders must not request compute-only inputs.
3366  if (IsGraphics) {
3367    const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3368    assert(!UserSGPRInfo.hasDispatchPtr() &&
3369           !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3370           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3371           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3372    (void)UserSGPRInfo;
3373    if (!Subtarget->hasFlatScratchEnabled())
3374      assert(!UserSGPRInfo.hasFlatScratchInit());
3375    if ((CallConv != CallingConv::AMDGPU_CS &&
3376         CallConv != CallingConv::AMDGPU_Gfx &&
3377         CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3378        !Subtarget->hasArchitectedSGPRs())
3379      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3380             !Info->hasWorkGroupIDZ());
3381  }
3382
3383  bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3384
3385  if (CallConv == CallingConv::AMDGPU_PS) {
3386    processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3387
3388    // At least one interpolation mode must be enabled or else the GPU will
3389    // hang.
3390    //
3391    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3392    // set PSInputAddr, the user wants to enable some bits after the compilation
3393    // based on run-time states. Since we can't know what the final PSInputEna
3394    // will look like, so we shouldn't do anything here and the user should take
3395    // responsibility for the correct programming.
3396    //
3397    // Otherwise, the following restrictions apply:
3398    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3399    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3400    //   enabled too.
3401    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3402        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3403      CCInfo.AllocateReg(AMDGPU::VGPR0);
3404      CCInfo.AllocateReg(AMDGPU::VGPR1);
3405      Info->markPSInputAllocated(0);
3406      Info->markPSInputEnabled(0);
3407    }
3408    if (Subtarget->isAmdPalOS()) {
3409      // For isAmdPalOS, the user does not enable some bits after compilation
3410      // based on run-time states; the register values being generated here are
3411      // the final ones set in hardware. Therefore we need to apply the
3412      // workaround to PSInputAddr and PSInputEnable together.  (The case where
3413      // a bit is set in PSInputAddr but not PSInputEnable is where the
3414      // frontend set up an input arg for a particular interpolation mode, but
3415      // nothing uses that input arg. Really we should have an earlier pass
3416      // that removes such an arg.)
3417      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3418      if ((PsInputBits & 0x7F) == 0 ||
3419          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3420        Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3421    }
3422  } else if (IsKernel) {
3423    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3424  } else {
    // Whole-wave functions carry a synthetic first input (the original EXEC
    // mask), handled separately below; skip it when splitting args.
3425    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3426                  Ins.end());
3427  }
3428
3429  if (IsKernel)
3430    analyzeFormalArgumentsCompute(CCInfo, Ins);
3431
3432  if (IsEntryFunc) {
3433    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3434    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3435    if (IsKernel && Subtarget->hasKernargPreload())
3436      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3437
3438    allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3439  } else if (!IsGraphics) {
3440    // For the fixed ABI, pass workitem IDs in the last argument register.
3441    allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3442
3443    // FIXME: Sink this into allocateSpecialInputSGPRs
3444    if (!Subtarget->hasFlatScratchEnabled())
3445      CCInfo.AllocateReg(Info->getScratchRSrcReg());
3446
3447    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3448  }
3449
3450  if (!IsKernel) {
3451    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3452    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3453
3454    // This assumes the registers are allocated by CCInfo in ascending order
3455    // with no gaps.
3456    Info->setNumWaveDispatchSGPRs(
3457        CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3458    Info->setNumWaveDispatchVGPRs(
3459        CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3460  } else if (Info->getNumKernargPreloadedSGPRs()) {
3461    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3462  }
3463
3465
  // Whole-wave functions start with a setup node that produces the original
  // EXEC mask as the first "argument" plus a chain.
3466  if (IsWholeWaveFunc) {
3467    SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3468                                {MVT::i1, MVT::Other}, Chain);
3469    InVals.push_back(Setup.getValue(0));
3470    Chains.push_back(Setup.getValue(1));
3471  }
3472
3473  // FIXME: This is the minimum kernel argument alignment. We should improve
3474  // this to the maximum alignment of the arguments.
3475  //
3476  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3477  // kern arg offset.
3478  const Align KernelArgBaseAlign = Align(16);
3479
3480  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3481       ++i) {
3482    const ISD::InputArg &Arg = Ins[i];
3483    if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3484      InVals.push_back(DAG.getPOISON(Arg.VT));
3485      continue;
3486    }
3487
3488    CCValAssign &VA = ArgLocs[ArgIdx++];
3489    MVT VT = VA.getLocVT();
3490
    // Entry-function mem locs are kernarg-segment offsets, not stack slots.
3491    if (IsEntryFunc && VA.isMemLoc()) {
3492      VT = Ins[i].VT;
3493      EVT MemVT = VA.getLocVT();
3494
3495      const uint64_t Offset = VA.getLocMemOffset();
3496      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3497
      // byref args are passed as a pointer into the kernarg segment.
3498      if (Arg.Flags.isByRef()) {
3499        SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3500
3501        const GCNTargetMachine &TM =
3502            static_cast<const GCNTargetMachine &>(getTargetMachine());
3503        if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3504                                    Arg.Flags.getPointerAddrSpace())) {
3507        }
3508
3509        InVals.push_back(Ptr);
3510        continue;
3511      }
3512
3513      SDValue NewArg;
3514      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3515        if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3516          // In this case the argument is packed into the previous preload SGPR.
3517          int64_t AlignDownOffset = alignDown(Offset, 4);
3518          int64_t OffsetDiff = Offset - AlignDownOffset;
3519          EVT IntVT = MemVT.changeTypeToInteger();
3520
3521          const SIMachineFunctionInfo *Info =
3524          Register Reg =
3525              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3526
3527          assert(Reg);
3528          Register VReg = MRI.getLiveInVirtReg(Reg);
3529          SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3530
          // Shift the packed sub-dword value down to bit 0, then truncate
          // and bitcast back to the in-memory type.
3531          SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3532          SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3533
3534          SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3535          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3536          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3537                                  Ins[i].Flags.isSExt(), &Ins[i]);
3538
3539          NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3540        } else {
3541          const SIMachineFunctionInfo *Info =
3544          const SmallVectorImpl<MCRegister> &PreloadRegs =
3545              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3546
3547          SDValue Copy;
3548          if (PreloadRegs.size() == 1) {
3549            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3550            const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3551            NewArg = DAG.getCopyFromReg(
3552                Chain, DL, VReg,
3554                                 TRI->getRegSizeInBits(*RC)));
3555
3556          } else {
3557            // If the kernarg alignment does not match the alignment of the SGPR
3558            // tuple RC that can accommodate this argument, it will be built up
3559            // via copies from the individual SGPRs that the argument was
3560            // preloaded to.
3562            for (auto Reg : PreloadRegs) {
3563              Register VReg = MRI.getLiveInVirtReg(Reg);
3564              Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3565              Elts.push_back(Copy);
3566            }
3567            NewArg =
3568                DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3569                                                    PreloadRegs.size()),
3570                                   DL, Elts);
3571          }
3572
3573          // If the argument was preloaded to multiple consecutive 32-bit
3574          // registers because of misalignment between addressable SGPR tuples
3575          // and the argument size, we can still assume that because of kernarg
3576          // segment alignment restrictions that NewArg's size is the same as
3577          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3578          // truncate since we cannot preload to less than a single SGPR and the
3579          // MemVT may be smaller.
3580          EVT MemVTInt =
3582          if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3583            NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3584
3585          NewArg = DAG.getBitcast(MemVT, NewArg);
3586          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3587                                  Ins[i].Flags.isSExt(), &Ins[i]);
3588          NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3589        }
3590      } else {
3591        // Hidden arguments that are in the kernel signature must be preloaded
3592        // to user SGPRs. Print a diagnostic error if a hidden argument is in
3593        // the argument list and is not preloaded.
3594        if (Arg.isOrigArg()) {
3595          Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3596          if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3598                *OrigArg->getParent(),
3599                "hidden argument in kernel signature was not preloaded",
3600                DL.getDebugLoc()));
3601          }
3602        }
3603
        // Ordinary case: load the argument from the kernarg segment.
3604        NewArg =
3605            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3606                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3607      }
3608      Chains.push_back(NewArg.getValue(1));
3609
3610      auto *ParamTy =
3611          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3612      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3613          ParamTy &&
3614          (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3615           ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3616        // On SI local pointers are just offsets into LDS, so they are always
3617        // less than 16-bits.  On CI and newer they could potentially be
3618        // real pointers, so we can't guarantee their size.
3619        NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3620                             DAG.getValueType(MVT::i16));
3621      }
3622
3623      InVals.push_back(NewArg);
3624      continue;
3625    }
    // Non-entry functions take stack arguments from real stack slots.
3626    if (!IsEntryFunc && VA.isMemLoc()) {
3627      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3628      InVals.push_back(Val);
3629      if (!Arg.Flags.isByVal())
3630        Chains.push_back(Val.getValue(1));
3631      continue;
3632    }
3633
3634    assert(VA.isRegLoc() && "Parameter must be in a register!");
3635
3636    Register Reg = VA.getLocReg();
3637    const TargetRegisterClass *RC = nullptr;
3638    if (AMDGPU::VGPR_32RegClass.contains(Reg))
3639      RC = &AMDGPU::VGPR_32RegClass;
3640    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3641      RC = &AMDGPU::SGPR_32RegClass;
3642    else
3643      llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3644
3645    Reg = MF.addLiveIn(Reg, RC);
3646    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    // 'inreg' values arriving in a VGPR are made uniform via readfirstlane.
3647    if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3648      // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3649      // they will read physical regs before any side effect instructions.
3650      SDValue ReadFirstLane =
3651          DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3653                        ReadFirstLane, Val);
3654    }
3655
3656    if (Arg.Flags.isSRet()) {
3657      // The return object should be reasonably addressable.
3658
3659      // FIXME: This helps when the return is a real sret. If it is a
3660      // automatically inserted sret (i.e. CanLowerReturn returns false), an
3661      // extra copy is inserted in SelectionDAGBuilder which obscures this.
3662      unsigned NumBits =
3664      Val = DAG.getNode(
3665          ISD::AssertZext, DL, VT, Val,
3666          DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3667    }
3668
3669    Val = convertABITypeToValueType(DAG, Val, VA, DL);
3670    InVals.push_back(Val);
3671  }
3672
3673  // Start adding system SGPRs.
3674  if (IsEntryFunc)
3675    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3676
3677  unsigned StackArgSize = CCInfo.getStackSize();
3678  Info->setBytesInStackArgArea(StackArgSize);
3679
  // Merge all argument-producing chains into a single token, if any.
3680  return Chains.empty() ? Chain
3681                        : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3682}
3683
3684// TODO: If return values can't fit in registers, we should return as many as
3685// possible in registers before passing on stack.
// Returns true when the return value can be passed entirely in registers
// (so no implicit sret rewrite is needed); entry functions always can.
3687    CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3688    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3689    const Type *RetTy) const {
3690  // Replacing returns with sret/stack usage doesn't make sense for shaders.
3691  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3692  // for shaders. Vector types should be explicitly handled by CC.
3693  if (AMDGPU::isEntryFunctionCC(CallConv))
3694    return true;
3695
3697  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3698  if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3699    return false;
3700
3701  // We must use the stack if return would require unavailable registers.
  // Any VGPR above this function's cap (but still architecturally
  // addressable) that the CC assigned means the return cannot use registers.
3702  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3703  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3704  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3705    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3706      return false;
3707
3708  return true;
3709}
3710
3711SDValue
// Lower an outgoing return: kernels delegate to the AMDGPU base lowering;
// otherwise return values are copied into their CC-assigned registers
// (readfirstlane'd for SGPR destinations) and the proper return/end node
// (ENDPGM, RETURN_TO_EPILOG, WHOLE_WAVE_RETURN, or RET_GLUE) is emitted.
3713                              bool isVarArg,
3715                              const SmallVectorImpl<SDValue> &OutVals,
3716                              const SDLoc &DL, SelectionDAG &DAG) const {
3720
3721  if (AMDGPU::isKernel(CallConv)) {
3722    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3723                                             OutVals, DL, DAG);
3724  }
3725
3726  bool IsShader = AMDGPU::isShader(CallConv);
3727
3728  Info->setIfReturnsVoid(Outs.empty());
  // A void-returning shader ends the wave outright (ENDPGM below).
3729  bool IsWaveEnd = Info->returnsVoid() && IsShader;
3730
3731  // CCValAssign - represent the assignment of the return value to a location.
3733
3734  // CCState - Info about the registers and stack slots.
3735  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3736                 *DAG.getContext());
3737
3738  // Analyze outgoing return values.
3739  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3740
3741  SDValue Glue;
3743  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3745  SDValue ReadFirstLane =
3746      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3747  // Copy the result values into the output registers.
3748  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3749       ++I, ++RealRVLocIdx) {
3750    CCValAssign &VA = RVLocs[I];
3751    assert(VA.isRegLoc() && "Can only return in registers!");
3752    // TODO: Partially return in registers if return values don't fit.
3753    SDValue Arg = OutVals[RealRVLocIdx];
3754
3755    // Copied from other backends.
    // Convert the value to its assigned location type before the copy.
3756    switch (VA.getLocInfo()) {
3757    case CCValAssign::Full:
3758      break;
3759    case CCValAssign::BCvt:
3760      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3761      break;
3762    case CCValAssign::SExt:
3763      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3764      break;
3765    case CCValAssign::ZExt:
3766      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3767      break;
3768    case CCValAssign::AExt:
3769      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3770      break;
3771    default:
3772      llvm_unreachable("Unknown loc info!");
3773    }
    // Values returned in SGPRs must be uniform; force with readfirstlane.
3774    if (TRI->isSGPRPhysReg(VA.getLocReg()))
3776                        ReadFirstLane, Arg);
3777    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3778    Glue = Chain.getValue(1);
3779    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3780  }
3781
3782  // FIXME: Does sret work properly?
  // Add split-CSR registers as extra return operands so they stay live.
3783  if (!Info->isEntryFunction()) {
3784    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3785    const MCPhysReg *I =
3786        TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3787    if (I) {
3788      for (; *I; ++I) {
3789        if (AMDGPU::SReg_64RegClass.contains(*I))
3790          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3791        else if (AMDGPU::SReg_32RegClass.contains(*I))
3792          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3793        else
3794          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3795      }
3796    }
3797  }
3798
3799  // Update chain and glue.
3800  RetOps[0] = Chain;
3801  if (Glue.getNode())
3802    RetOps.push_back(Glue);
3803
3804  unsigned Opc = AMDGPUISD::ENDPGM;
3805  if (!IsWaveEnd)
3806    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3807          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
3808                                      : AMDGPUISD::RET_GLUE;
3809  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3810}
3811
// Lower the results of an outgoing call: copy each CC-assigned return
// register out of its physreg (threading chain and glue) and convert it
// back to the value type, appending the results to InVals.
3813    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3814    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3815    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3816    SDValue ThisVal) const {
3817  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3818
3819  // Assign locations to each value returned by this call.
3821  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3822                 *DAG.getContext());
3823  CCInfo.AnalyzeCallResult(Ins, RetCC);
3824
3825  // Copy all of the result registers out of their specified physreg.
3826  for (CCValAssign VA : RVLocs) {
3827    SDValue Val;
3828
3829    if (VA.isRegLoc()) {
3830      Val =
3831          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
      // Thread the chain and glue through successive CopyFromReg nodes so
      // the copies stay ordered directly after the call.
3832      Chain = Val.getValue(1);
3833      InGlue = Val.getValue(2);
3834    } else if (VA.isMemLoc()) {
3835      report_fatal_error("TODO: return values in memory");
3836    } else
3837      llvm_unreachable("unknown argument location type");
3838
    // Undo the location-type conversion applied on the return side.
3839    switch (VA.getLocInfo()) {
3840    case CCValAssign::Full:
3841      break;
3842    case CCValAssign::BCvt:
3843      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3844      break;
3845    case CCValAssign::ZExt:
3846      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3847                        DAG.getValueType(VA.getValVT()));
3848      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3849      break;
3850    case CCValAssign::SExt:
3851      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3852                        DAG.getValueType(VA.getValVT()));
3853      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3854      break;
3855    case CCValAssign::AExt:
3856      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3857      break;
3858    default:
3859      llvm_unreachable("Unknown loc info!");
3860    }
3861
3862    InVals.push_back(Val);
3863  }
3864
3865  return Chain;
3866}
3867
3868// Add code to pass special inputs required depending on used features separate
3869// from the explicit user arguments present in the IR.
3871 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3872 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3873 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3874 // If we don't have a call site, this was a call inserted by
3875 // legalization. These can never use special inputs.
3876 if (!CLI.CB)
3877 return;
3878
3879 SelectionDAG &DAG = CLI.DAG;
3880 const SDLoc &DL = CLI.DL;
3881 const Function &F = DAG.getMachineFunction().getFunction();
3882
3883 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3884 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3885
3886 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3888
3889 // TODO: Unify with private memory register handling. This is complicated by
3890 // the fact that at least in kernels, the input argument is not necessarily
3891 // in the same location as the input.
3892 // clang-format off
3893 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3894 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3895 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3896 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3897 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3898 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3899 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3900 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3901 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3902 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3903 };
3904 // clang-format on
3905
3906 for (auto [InputID, Attrs] : ImplicitAttrs) {
3907 // If the callee does not use the attribute value, skip copying the value.
3908 if (all_of(Attrs, [&](StringRef Attr) {
3909 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3910 }))
3911 continue;
3912
3913 const auto [OutgoingArg, ArgRC, ArgTy] =
3914 CalleeArgInfo.getPreloadedValue(InputID);
3915 if (!OutgoingArg)
3916 continue;
3917
3918 const auto [IncomingArg, IncomingArgRC, Ty] =
3919 CallerArgInfo.getPreloadedValue(InputID);
3920 assert(IncomingArgRC == ArgRC);
3921
3922 // All special arguments are ints for now.
3923 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3924 SDValue InputReg;
3925
3926 if (IncomingArg) {
3927 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3928 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3929 // The implicit arg ptr is special because it doesn't have a corresponding
3930 // input for kernels, and is computed from the kernarg segment pointer.
3931 InputReg = getImplicitArgPtr(DAG, DL);
3932 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3933 std::optional<uint32_t> Id =
3935 if (Id.has_value()) {
3936 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3937 } else {
3938 InputReg = DAG.getPOISON(ArgVT);
3939 }
3940 } else {
3941 // We may have proven the input wasn't needed, although the ABI is
3942 // requiring it. We just need to allocate the register appropriately.
3943 InputReg = DAG.getPOISON(ArgVT);
3944 }
3945
3946 if (OutgoingArg->isRegister()) {
3947 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3948 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3949 report_fatal_error("failed to allocate implicit input argument");
3950 } else {
3951 unsigned SpecialArgOffset =
3952 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3953 SDValue ArgStore =
3954 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3955 MemOpChains.push_back(ArgStore);
3956 }
3957 }
3958
3959 // Pack workitem IDs into a single register or pass it as is if already
3960 // packed.
3961
3962 auto [OutgoingArg, ArgRC, Ty] =
3964 if (!OutgoingArg)
3965 std::tie(OutgoingArg, ArgRC, Ty) =
3967 if (!OutgoingArg)
3968 std::tie(OutgoingArg, ArgRC, Ty) =
3970 if (!OutgoingArg)
3971 return;
3972
3973 const ArgDescriptor *IncomingArgX = std::get<0>(
3975 const ArgDescriptor *IncomingArgY = std::get<0>(
3977 const ArgDescriptor *IncomingArgZ = std::get<0>(
3979
3980 SDValue InputReg;
3981 SDLoc SL;
3982
3983 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3984 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3985 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3986
3987 // If incoming ids are not packed we need to pack them.
3988 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
3989 NeedWorkItemIDX) {
3990 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3991 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3992 } else {
3993 InputReg = DAG.getConstant(0, DL, MVT::i32);
3994 }
3995 }
3996
3997 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
3998 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3999 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
4000 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
4001 DAG.getShiftAmountConstant(10, MVT::i32, SL));
4002 InputReg = InputReg.getNode()
4003 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
4004 : Y;
4005 }
4006
4007 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4008 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4009 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
4010 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4011 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4012 InputReg = InputReg.getNode()
4013 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4014 : Z;
4015 }
4016
4017 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4018 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4019 // We're in a situation where the outgoing function requires the workitem
4020 // ID, but the calling function does not have it (e.g a graphics function
4021 // calling a C calling convention function). This is illegal, but we need
4022 // to produce something.
4023 InputReg = DAG.getPOISON(MVT::i32);
4024 } else {
4025 // Workitem ids are already packed, any of present incoming arguments
4026 // will carry all required fields.
4027 ArgDescriptor IncomingArg =
4028 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4029 : IncomingArgY ? *IncomingArgY
4030 : *IncomingArgZ,
4031 ~0u);
4032 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4033 }
4034 }
4035
4036 if (OutgoingArg->isRegister()) {
4037 if (InputReg)
4038 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4039
4040 CCInfo.AllocateReg(OutgoingArg->getRegister());
4041 } else {
4042 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4043 if (InputReg) {
4044 SDValue ArgStore =
4045 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4046 MemOpChains.push_back(ArgStore);
4047 }
4048 }
4049}
4050
// Returns true when this call site may be lowered as a tail call. Checks
// calling-convention compatibility, callee divergence, byval/vararg
// restrictions, register-preservation masks, and whether the outgoing stack
// arguments fit inside the caller's own incoming-argument save area.
// NOTE(review): the declaration line of this definition (source line 4051)
// and a few other hyperlinked lines were dropped by the extraction; only
// comment text has been added below.
4052 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4054 const SmallVectorImpl<SDValue> &OutVals,
4055 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Chain calls (llvm.amdgcn.cs.chain) must always be lowered as tail calls;
// LowerCall raises a fatal error if tail-call lowering fails for them.
4056 if (AMDGPU::isChainCC(CalleeCC))
4057 return true;
4058
4059 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
4060 return false;
4061
4062 // For a divergent call target, we need to do a waterfall loop over the
4063 // possible callees which precludes us from using a simple jump.
4064 if (Callee->isDivergent())
4065 return false;
4066
4068 const Function &CallerF = MF.getFunction();
4069 CallingConv::ID CallerCC = CallerF.getCallingConv();
4071 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4072
4073 // Kernels aren't callable, and don't have a live in return address so it
4074 // doesn't make sense to do a tail call with entry functions.
4075 if (!CallerPreserved)
4076 return false;
4077
4078 bool CCMatch = CallerCC == CalleeCC;
4079
// NOTE(review): the guard opening this block (source line 4080) is missing
// from this extracted view; presumably it tests
// MF.getTarget().Options.GuaranteedTailCallOpt — confirm against upstream.
4081 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4082 return true;
4083 return false;
4084 }
4085
4086 // TODO: Can we handle var args?
4087 if (IsVarArg)
4088 return false;
4089
// Byval arguments in the caller live in its stack frame, which a tail call
// would deallocate, so conservatively reject.
4090 for (const Argument &Arg : CallerF.args()) {
4091 if (Arg.hasByValAttr())
4092 return false;
4093 }
4094
4095 LLVMContext &Ctx = *DAG.getContext();
4096
4097 // Check that the call results are passed in the same way.
4098 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4099 CCAssignFnForCall(CalleeCC, IsVarArg),
4100 CCAssignFnForCall(CallerCC, IsVarArg)))
4101 return false;
4102
4103 // The callee has to preserve all registers the caller needs to preserve.
4104 if (!CCMatch) {
4105 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4106 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4107 return false;
4108 }
4109
4110 // Nothing more to check if the callee is taking no arguments.
4111 if (Outs.empty())
4112 return true;
4113
// NOTE(review): source line 4114 (presumably the SmallVector<CCValAssign>
// ArgLocs declaration used just below) is missing from this extract.
4115 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4116
4117 // FIXME: We are not allocating special input registers, so we will be
4118 // deciding based on incorrect register assignments.
4119 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4120
4121 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4122 // If the stack arguments for this call do not fit into our own save area then
4123 // the call cannot be made tail.
4124 // TODO: Is this really necessary?
4125 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4126 return false;
4127
4128 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4129 // FIXME: What about inreg arguments that end up passed in memory?
4130 if (!CCVA.isRegLoc())
4131 continue;
4132
4133 // If we are passing an argument in an SGPR, and the value is divergent,
4134 // this call requires a waterfall loop.
4135 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4136 LLVM_DEBUG(
4137 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4138 << printReg(CCVA.getLocReg(), TRI) << '\n');
4139 return false;
4140 }
4141 }
4142
4143 const MachineRegisterInfo &MRI = MF.getRegInfo();
4144 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4145}
4146
// Returns true if the IR call instruction CI should even be considered for
// tail-call lowering: it must be marked as a tail call, and a property of
// the parent function must hold.
// NOTE(review): source line 4152 — the condition guarding the first
// "return false" — is missing from this extracted view; presumably it checks
// the parent function's calling convention. Confirm against upstream.
4148 if (!CI->isTailCall())
4149 return false;
4150
4151 const Function *ParentFn = CI->getFunction();
4153 return false;
4154 return true;
4155}
4156
namespace {
// Indices of the special trailing arguments of an llvm.amdgcn.cs.chain call.
// They follow the ordinary SGPR and VGPR argument arrays, which occupy
// original-argument indices 0 and 1 respectively.
enum ChainCallArgIdx {
  Exec = 2,           // Value to install in EXEC for the chained callee.
  Flags = 3,          // Flags word selecting optional behavior.
  NumVGPRs = 4,       // VGPR count (dynamic-VGPR mode only).
  FallbackExec = 5,   // EXEC value for the fallback path.
  FallbackCallee = 6  // Callee for the fallback path.
};
} // anonymous namespace
4169
// Lower an outgoing call for the SI target: peels off the special trailing
// arguments of amdgcn.cs.chain calls, passes the implicit special inputs,
// assigns argument locations, emits register copies and stack stores for the
// arguments, and finally builds either a TC_RETURN* node (tail call) or an
// AMDGPUISD::CALL node.
// NOTE(review): several hyperlinked lines of this definition (the declaration
// line and assorted declarations/case labels) were dropped by the extraction;
// only comment text has been added or corrected below.
4170// The wave scratch offset register is used as the global base pointer.
4172 SmallVectorImpl<SDValue> &InVals) const {
4173 CallingConv::ID CallConv = CLI.CallConv;
4174 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4175
4176 SelectionDAG &DAG = CLI.DAG;
4177
4178 const SDLoc &DL = CLI.DL;
4179 SDValue Chain = CLI.Chain;
4180 SDValue Callee = CLI.Callee;
4181
4182 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4183 bool UsesDynamicVGPRs = false;
4184 if (IsChainCallConv) {
4185 // The last arguments should be the value that we need to put in EXEC,
4186 // followed by the flags and any other arguments with special meanings.
4187 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4188 // we don't treat them like the "real" arguments.
4189 auto RequestedExecIt =
4190 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4191 return Arg.OrigArgIndex == 2;
4192 });
4193 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4194
4195 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4196 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4197 CLI.OutVals.end());
4198 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4199
4200 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4201 "Haven't popped all the special args");
4202
// The EXEC argument must be an integer as wide as the wavefront's lane mask.
4203 TargetLowering::ArgListEntry RequestedExecArg =
4204 CLI.Args[ChainCallArgIdx::Exec];
4205 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4206 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4207
4208 // Convert constants into TargetConstants, so they become immediate operands
4209 // instead of being selected into S_MOV.
4210 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4211 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4212 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4213 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4214 } else
4215 ChainCallSpecialArgs.push_back(Arg.Node);
4216 };
4217
4218 PushNodeOrTargetConstant(RequestedExecArg);
4219
4220 // Process any other special arguments depending on the value of the flags.
4221 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4222
4223 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4224 if (FlagsValue.isZero()) {
4225 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4226 return lowerUnhandledCall(CLI, InVals,
4227 "no additional args allowed if flags == 0");
4228 } else if (FlagsValue.isOneBitSet(0)) {
// Bit 0 of the flags selects dynamic-VGPR mode, which carries exactly three
// extra arguments: NumVGPRs, FallbackExec and FallbackCallee.
4229 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4230 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4231 }
4232
4233 if (!Subtarget->isWave32()) {
4234 return lowerUnhandledCall(
4235 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4236 }
4237
4238 UsesDynamicVGPRs = true;
4239 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4240 CLI.Args.end(), PushNodeOrTargetConstant);
4241 }
4242 }
4243
// NOTE(review): source lines 4244/4246/4250 (declarations of Outs, Ins and
// MachineFunction MF used below) are missing from this extracted view.
4245 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4247 bool &IsTailCall = CLI.IsTailCall;
4248 bool IsVarArg = CLI.IsVarArg;
4249 bool IsSibCall = false;
4251
// Calls to undef/null are trivially dead; just return poison results.
4252 if (Callee.isUndef() || isNullConstant(Callee)) {
4253 if (!CLI.IsTailCall) {
4254 for (ISD::InputArg &Arg : CLI.Ins)
4255 InVals.push_back(DAG.getPOISON(Arg.VT));
4256 }
4257
4258 return Chain;
4259 }
4260
4261 if (IsVarArg) {
4262 return lowerUnhandledCall(CLI, InVals,
4263 "unsupported call to variadic function ");
4264 }
4265
4266 if (!CLI.CB)
4267 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4268
4269 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4270 return lowerUnhandledCall(CLI, InVals,
4271 "unsupported required tail call to function ");
4272 }
4273
4274 if (IsTailCall) {
4275 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4276 Outs, OutVals, Ins, DAG);
4277 if (!IsTailCall &&
4278 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4279 report_fatal_error("failed to perform tail call elimination on a call "
4280 "site marked musttail or on llvm.amdgcn.cs.chain");
4281 }
4282
4283 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4284
4285 // A sibling call is one where we're under the usual C ABI and not planning
4286 // to change that but can still do a tail call:
4287 if (!TailCallOpt && IsTailCall)
4288 IsSibCall = true;
4289
4290 if (IsTailCall)
4291 ++NumTailCalls;
4292 }
4293
// NOTE(review): source lines 4294-4295 (declarations of Info and RegsToPass
// used below) are missing from this extracted view.
4296 SmallVector<SDValue, 8> MemOpChains;
4297
4298 // Analyze operands of the call, assigning locations to each operand.
4300 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4301 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4302
4303 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4305 // With a fixed ABI, allocate fixed registers before user arguments.
4306 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4307 }
4308
4309 // Mark the scratch resource descriptor as allocated so the CC analysis
4310 // does not assign user arguments to these registers, matching the callee.
4311 if (!Subtarget->hasFlatScratchEnabled())
4312 CCInfo.AllocateReg(Info->getScratchRSrcReg());
4313
4314 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4315
4316 // Get a count of how many bytes are to be pushed on the stack.
4317 unsigned NumBytes = CCInfo.getStackSize();
4318
4319 if (IsSibCall) {
4320 // Since we're not changing the ABI to make this a tail call, the memory
4321 // operands are already available in the caller's incoming argument space.
4322 NumBytes = 0;
4323 }
4324
4325 // FPDiff is the byte offset of the call's argument area from the callee's.
4326 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4327 // by this amount for a tail call. In a sibling call it must be 0 because the
4328 // caller will deallocate the entire stack and the callee still expects its
4329 // arguments to begin at SP+0. Completely unused for non-tail calls.
4330 int32_t FPDiff = 0;
4331 MachineFrameInfo &MFI = MF.getFrameInfo();
4332 auto *TRI = Subtarget->getRegisterInfo();
4333
4334 // Adjust the stack pointer for the new arguments...
4335 // These operations are automatically eliminated by the prolog/epilog pass
4336 if (!IsSibCall)
4337 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4338
4339 if (!IsSibCall || IsChainCallConv) {
4340 if (!Subtarget->hasFlatScratchEnabled()) {
4341 SmallVector<SDValue, 4> CopyFromChains;
4342
4343 // In the HSA case, this should be an identity copy.
4344 SDValue ScratchRSrcReg =
4345 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4346 RegsToPass.emplace_back(IsChainCallConv
4347 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4348 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4349 ScratchRSrcReg);
4350 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4351 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4352 }
4353 }
4354
// Entries appended to RegsToPass so far are implicit/special inputs; the
// readfirstlane heuristic below is skipped for them.
4355 const unsigned NumSpecialInputs = RegsToPass.size();
4356
4357 MVT PtrVT = MVT::i32;
4358
4359 // Walk the register/memloc assignments, inserting copies/loads.
4360 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4361 CCValAssign &VA = ArgLocs[i];
4362 SDValue Arg = OutVals[i];
4363
4364 // Promote the value if needed.
4365 switch (VA.getLocInfo()) {
4366 case CCValAssign::Full:
4367 break;
4368 case CCValAssign::BCvt:
4369 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4370 break;
4371 case CCValAssign::ZExt:
4372 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4373 break;
4374 case CCValAssign::SExt:
4375 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4376 break;
4377 case CCValAssign::AExt:
4378 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4379 break;
4380 case CCValAssign::FPExt:
4381 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4382 break;
4383 default:
4384 llvm_unreachable("Unknown loc info!");
4385 }
4386
4387 if (VA.isRegLoc()) {
4388 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4389 } else {
4390 assert(VA.isMemLoc());
4391
4392 SDValue DstAddr;
4393 MachinePointerInfo DstInfo;
4394
4395 unsigned LocMemOffset = VA.getLocMemOffset();
4396 int32_t Offset = LocMemOffset;
4397
4398 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4399 MaybeAlign Alignment;
4400
4401 if (IsTailCall) {
4402 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4403 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4404 : VA.getValVT().getStoreSize();
4405
4406 // FIXME: We can have better than the minimum byval required alignment.
4407 Alignment =
4408 Flags.isByVal()
4409 ? Flags.getNonZeroByValAlign()
4410 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4411
4412 Offset = Offset + FPDiff;
4413 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4414
4415 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4416 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4417
4418 // Make sure any stack arguments overlapping with where we're storing
4419 // are loaded before this eventual operation. Otherwise they'll be
4420 // clobbered.
4421
4422 // FIXME: Why is this really necessary? This seems to just result in a
4423 // lot of code to copy the stack and write them back to the same
4424 // locations, which are supposed to be immutable?
4425 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4426 } else {
4427 // Stores to the argument stack area are relative to the stack pointer.
4428 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4429 MVT::i32);
4430 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4431 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4432 Alignment =
4433 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4434 }
4435
4436 if (Outs[i].Flags.isByVal()) {
4437 SDValue SizeNode =
4438 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4439 SDValue Cpy =
4440 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4441 Outs[i].Flags.getNonZeroByValAlign(),
4442 /*isVol = */ false, /*AlwaysInline = */ true,
4443 /*CI=*/nullptr, std::nullopt, DstInfo,
4445
4446 MemOpChains.push_back(Cpy);
4447 } else {
4448 SDValue Store =
4449 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4450 MemOpChains.push_back(Store);
4451 }
4452 }
4453 }
4454
4455 if (!MemOpChains.empty())
4456 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4457
4458 SDValue ReadFirstLaneID =
4459 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4460
4461 SDValue TokenGlue;
4462 if (CLI.ConvergenceControlToken) {
4463 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4465 }
4466
4467 // Build a sequence of copy-to-reg nodes chained together with token chain
4468 // and flag operands which copy the outgoing args into the appropriate regs.
4469 SDValue InGlue;
4470
4471 unsigned ArgIdx = 0;
4472 for (auto [Reg, Val] : RegsToPass) {
4473 if (ArgIdx++ >= NumSpecialInputs &&
4474 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4475 // For chain calls, the inreg arguments are required to be
4476 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
4477 // they are uniform.
4478 //
4479 // For other calls, if an inreg arguments is known to be uniform,
4480 // speculatively insert a readfirstlane in case it is in a VGPR.
4481 //
4482 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4483 // value, so let that continue to produce invalid code.
4484
4485 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4486 if (TokenGlue)
4487 ReadfirstlaneArgs.push_back(TokenGlue);
4489 ReadfirstlaneArgs);
4490 }
4491
4492 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4493 InGlue = Chain.getValue(1);
4494 }
4495
4496 // We don't usually want to end the call-sequence here because we would tidy
4497 // the frame up *after* the call, however in the ABI-changing tail-call case
4498 // we've carefully laid out the parameters so that when sp is reset they'll be
4499 // in the correct location.
4500 if (IsTailCall && !IsSibCall) {
4501 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4502 InGlue = Chain.getValue(1);
4503 }
4504
4505 std::vector<SDValue> Ops({Chain});
4506
4507 // Add a redundant copy of the callee global which will not be legalized, as
4508 // we need direct access to the callee later.
4510 const GlobalValue *GV = GSD->getGlobal();
4511 Ops.push_back(Callee);
4512 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4513 } else {
4514 if (IsTailCall) {
4515 // isEligibleForTailCallOptimization considered whether the call target is
4516 // divergent, but we may still end up with a uniform value in a VGPR.
4517 // Insert a readfirstlane just in case.
4518 SDValue ReadFirstLaneID =
4519 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4520
4521 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4522 if (TokenGlue)
4523 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4524 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4525 ReadfirstlaneArgs);
4526 }
4527
4528 Ops.push_back(Callee);
4529 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4530 }
4531
4532 if (IsTailCall) {
4533 // Each tail call may have to adjust the stack by a different amount, so
4534 // this information must travel along with the operation for eventual
4535 // consumption by emitEpilogue.
4536 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4537 }
4538
4539 if (IsChainCallConv)
4540 llvm::append_range(Ops, ChainCallSpecialArgs);
4541
4542 // Add argument registers to the end of the list so that they are known live
4543 // into the call.
4544 for (auto &[Reg, Val] : RegsToPass)
4545 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4546
4547 // Add a register mask operand representing the call-preserved registers.
4548 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4549 assert(Mask && "Missing call preserved mask for calling convention");
4550 Ops.push_back(DAG.getRegisterMask(Mask));
4551
4552 if (SDValue Token = CLI.ConvergenceControlToken) {
4554 GlueOps.push_back(Token);
4555 if (InGlue)
4556 GlueOps.push_back(InGlue);
4557
4558 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4559 MVT::Glue, GlueOps),
4560 0);
4561 }
4562
4563 if (InGlue)
4564 Ops.push_back(InGlue);
4565
4566 // If we're doing a tail call, use a TC_RETURN here rather than an
4567 // actual call instruction.
4568 if (IsTailCall) {
4569 MFI.setHasTailCall();
4570 unsigned OPC = AMDGPUISD::TC_RETURN;
// NOTE(review): the case labels for this switch (source lines 4572 and
// 4575-4576, presumably CallingConv::AMDGPU_Gfx and the chain CCs) are
// missing from this extracted view.
4571 switch (CallConv) {
4573 OPC = AMDGPUISD::TC_RETURN_GFX;
4574 break;
4577 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4578 : AMDGPUISD::TC_RETURN_CHAIN;
4579 break;
4580 }
4581
4582 // If the caller is a whole wave function, we need to use a special opcode
4583 // so we can patch up EXEC.
4584 if (Info->isWholeWaveFunction())
4585 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4586
4587 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4588 }
4589
4590 // Returns a chain and a flag for retval copy to use.
4591 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4592 Chain = Call.getValue(0);
4593 InGlue = Call.getValue(1);
4594
4595 uint64_t CalleePopBytes = NumBytes;
4596 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4597 if (!Ins.empty())
4598 InGlue = Chain.getValue(1);
4599
4600 // Handle result values, copying them out of physregs into vregs that we
4601 // return.
4602 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4603 InVals, /*IsThisReturn=*/false, SDValue());
4604}
4605
4606 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4607 // except for:
4608 // 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4609 // 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
// Returns {allocated base address, chain}. The SP holds a wave-uniform byte
// offset, so both the requested size and alignment are scaled by the
// wavefront size before being applied.
// NOTE(review): the declaration line (4610) and a couple of hyperlinked lines
// (4613, 4629, 4646) are missing from this extracted view.
4611 SelectionDAG &DAG) const {
4612 const MachineFunction &MF = DAG.getMachineFunction();
4614
4615 SDLoc dl(Op);
4616 EVT VT = Op.getValueType();
4617 SDValue Chain = Op.getOperand(0);
4618 Register SPReg = Info->getStackPtrOffsetReg();
4619
4620 // Chain the dynamic stack allocation so that it doesn't modify the stack
4621 // pointer when other instructions are using the stack.
4622 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4623
4624 SDValue Size = Op.getOperand(1);
4625 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4626 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4627
4628 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4630 "Stack grows upwards for AMDGPU");
4631
4632 Chain = BaseAddr.getValue(1);
4633 Align StackAlign = TFL->getStackAlign();
4634 if (Alignment > StackAlign) {
// Over-aligned alloca: round the (wave-scaled) base up to the scaled
// alignment with an add-then-mask sequence.
4635 uint64_t ScaledAlignment = Alignment.value()
4636 << Subtarget->getWavefrontSizeLog2();
4637 uint64_t StackAlignMask = ScaledAlignment - 1;
4638 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4639 DAG.getConstant(StackAlignMask, dl, VT));
4640 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4641 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4642 }
4643
4644 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4645 SDValue NewSP;
// NOTE(review): the guard choosing this branch (source line 4646) is missing
// from this view; per the comments it distinguishes constant-sized allocas.
4647 // For constant sized alloca, scale alloca size by wave-size
4648 SDValue ScaledSize = DAG.getNode(
4649 ISD::SHL, dl, VT, Size,
4650 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4651 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4652 } else {
4653 // For dynamic sized alloca, perform wave-wide reduction to get max of
4654 // alloca size(divergent) and then scale it by wave-size
4655 SDValue WaveReduction =
4656 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4657 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4658 Size, DAG.getTargetConstant(0, dl, MVT::i32));
4659 SDValue ScaledSize = DAG.getNode(
4660 ISD::SHL, dl, VT, Size,
4661 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4662 NewSP =
4663 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
// The SP must stay wave-uniform, so force the new value into an SGPR.
4664 SDValue ReadFirstLaneID =
4665 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4666 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4667 NewSP);
4668 }
4669
4670 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4671 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4672
4673 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4674}
4675
// Lower STACKSAVE: read the stack pointer and convert the wave-uniform byte
// offset into a swizzled per-lane vector address.
// NOTE(review): the declaration line (4676) and source line 4680 (declaring
// SP, presumably the stack-pointer register) are missing from this view.
4677 if (Op.getValueType() != MVT::i32)
4678 return Op; // Defer to cannot select error.
4679
4681 SDLoc SL(Op);
4682
4683 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4684
4685 // Convert from wave uniform to swizzled vector address. This should protect
4686 // from any edge cases where the stacksave result isn't directly used with
4687 // stackrestore.
4688 SDValue VectorAddress =
4689 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4690 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4691}
4692
// Lower GET_ROUNDING (C FLT_ROUNDS): read both hardware round-mode fields
// with s_getreg and translate the raw 4-bit value through a 64-bit lookup
// table into the FLT_ROUNDS-style result. Returns {result, chain}.
// NOTE(review): the declaration line (4693) and two hyperlinked lines (4699,
// 4736 — the hwreg encoding and the conversion-table constant) are missing
// from this extracted view.
4694 SelectionDAG &DAG) const {
4695 SDLoc SL(Op);
4696 assert(Op.getValueType() == MVT::i32);
4697
4698 uint32_t BothRoundHwReg =
4700 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4701
4702 SDValue IntrinID =
4703 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4704 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4705 Op.getOperand(0), IntrinID, GetRoundBothImm);
4706
4707 // There are two rounding modes, one for f32 and one for f64/f16. We only
4708 // report in the standard value range if both are the same.
4709 //
4710 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4711 // ties away from zero is not supported, and the other values are rotated by
4712 // 1.
4713 //
4714 // If the two rounding modes are not the same, report a target defined value.
4715
4716 // Mode register rounding mode fields:
4717 //
4718 // [1:0] Single-precision round mode.
4719 // [3:2] Double/Half-precision round mode.
4720 //
4721 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4722 //
4723 // Hardware Spec
4724 // Toward-0 3 0
4725 // Nearest Even 0 1
4726 // +Inf 1 2
4727 // -Inf 2 3
4728 // NearestAway0 N/A 4
4729 //
4730 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4731 // table we can index by the raw hardware mode.
4732 //
4733 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4734
4735 SDValue BitTable =
4737
// Each table entry is 4 bits wide, so the raw mode is scaled by 4 (<< 2) to
// form the bit index into the table.
4738 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4739 SDValue RoundModeTimesNumBits =
4740 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4741
4742 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4743 // knew only one mode was demanded.
4744 SDValue TableValue =
4745 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4746 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4747
4748 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4749 SDValue TableEntry =
4750 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4751
4752 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4753 // if it's an extended value.
4754 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4755 SDValue IsStandardValue =
4756 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4757 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4758 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4759 TableEntry, EnumOffset);
4760
4761 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4762}
4763
// Lower SET_ROUNDING: translate a FLT_ROUNDS-style mode value into the 4-bit
// hardware MODE.fp_round encoding (constant-folded when possible, otherwise
// via a lookup table) and write it with s_setreg.
// NOTE(review): the declaration line (4764) and a few hyperlinked lines
// (4776, 4805, 4839 — clamp bound, table constant, hwreg encoding) are
// missing from this extracted view.
4765 SelectionDAG &DAG) const {
4766 SDLoc SL(Op);
4767
4768 SDValue NewMode = Op.getOperand(1);
4769 assert(NewMode.getValueType() == MVT::i32);
4770
4771 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4772 // hardware MODE.fp_round values.
4773 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
// Constant mode: fold the table lookup at compile time.
4774 uint32_t ClampedVal = std::min(
4775 static_cast<uint32_t>(ConstMode->getZExtValue()),
4777 NewMode = DAG.getConstant(
4778 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4779 } else {
4780 // If we know the input can only be one of the supported standard modes in
4781 // the range 0-3, we can use a simplified mapping to hardware values.
4782 KnownBits KB = DAG.computeKnownBits(NewMode);
4783 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4784 // The supported standard values are 0-3. The extended values start at 8. We
4785 // need to offset by 4 if the value is in the extended range.
4786
4787 if (UseReducedTable) {
4788 // Truncate to the low 32-bits.
4789 SDValue BitTable = DAG.getConstant(
4790 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4791
4792 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4793 SDValue RoundModeTimesNumBits =
4794 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4795
4796 NewMode =
4797 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4798
4799 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4800 // the table extracted bits into inline immediates.
4801 } else {
4802 // table_index = umin(value, value - 4)
4803 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4804 SDValue BitTable =
4806
4807 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4808 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4809 SDValue IndexVal =
4810 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4811
4812 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4813 SDValue RoundModeTimesNumBits =
4814 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4815
4816 SDValue TableValue =
4817 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4818 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4819
4820 // No need to mask out the high bits since the setreg will ignore them
4821 // anyway.
4822 NewMode = TruncTable;
4823 }
4824
4825 // Insert a readfirstlane in case the value is a VGPR. We could do this
4826 // earlier and keep more operations scalar, but that interferes with
4827 // combining the source.
4828 SDValue ReadFirstLaneID =
4829 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4830 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4831 ReadFirstLaneID, NewMode);
4832 }
4833
4834 // N.B. The setreg will be later folded into s_round_mode on supported
4835 // targets.
4836 SDValue IntrinID =
4837 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4838 uint32_t BothRoundHwReg =
4840 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4841
4842 SDValue SetReg =
4843 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4844 IntrinID, RoundBothImm, NewMode);
4845
4846 return SetReg;
4847}
4848
// SITargetLowering::lowerPREFETCH — legalize an ISD::PREFETCH node.
// Returns SDValue() to discard the prefetch when the target cannot perform
// it, otherwise returns Op unchanged. NOTE(review): the function signature
// and several case-label lines were dropped by the HTML export of this file.
4850 if (Op->isDivergent() &&
4851 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4852 // Cannot do I$ prefetch with divergent pointer.
4853 return SDValue();
4854
// Filter by address space. NOTE(review): the case labels on the dropped
// export lines (original 4856-4860) are not visible here.
4855 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4859 break;
4861 if (Subtarget->hasSafeSmemPrefetch())
4862 break;
4863 [[fallthrough]];
4864 default:
4865 return SDValue();
4866 }
4867
// Operand 4 is the ISD::PREFETCH cache-type operand: 0 selects an
// instruction prefetch, which is only kept when SMEM prefetch is safe.
4868 // I$ prefetch
4869 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4870 return SDValue();
4871
4872 return Op;
4873}
4874
// SITargetLowering::lowerFP_EXTEND — NOTE(review): the signature line
// (original 4876) was dropped by the export. Lowers [STRICT_]FP_EXTEND from
// bf16 sources through BF16_TO_FP; non-bf16 sources pass through untouched.
4875// Work around DAG legality rules only based on the result type.
4877 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
// Strict nodes carry the chain in operand 0; the value is then operand 1.
4878 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4879 EVT SrcVT = Src.getValueType();
4880
4881 if (SrcVT.getScalarType() != MVT::bf16)
4882 return Op;
4883
4884 SDLoc SL(Op);
// Reinterpret the bf16 bits as an integer so BF16_TO_FP can consume them.
4885 SDValue BitCast =
4886 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4887
4888 EVT DstVT = Op.getValueType();
// There is no strict variant of BF16_TO_FP yet.
4889 if (IsStrict)
4890 llvm_unreachable("Need STRICT_BF16_TO_FP");
4891
4892 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4893}
4894
// SITargetLowering::lowerGET_FPENV (signature dropped by the export).
// Builds the i64 FP-environment value by reading the MODE and TRAP hardware
// registers with s_getreg and packing {mode, trap} into a v2i32 -> i64.
4896 SDLoc SL(Op);
// Only the i64 form is custom-lowered here.
4897 if (Op.getValueType() != MVT::i64)
4898 return Op;
4899
// NOTE(review): the HwregEncoding::encode(...) initializers on dropped
// export lines (original 4901/4904) are not visible here.
4900 uint32_t ModeHwReg =
4902 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4903 uint32_t TrapHwReg =
4905 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4906
4907 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4908 SDValue IntrinID =
4909 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
// Two chained s_getreg reads: one for MODE, one for TRAP.
4910 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4911 Op.getOperand(0), IntrinID, ModeHwRegImm);
4912 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4913 Op.getOperand(0), IntrinID, TrapHwRegImm);
// Merge both output chains so neither read can be dropped or reordered.
4914 SDValue TokenReg =
4915 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4916 GetTrapReg.getValue(1));
4917
// Pack MODE into the low half and TRAP into the high half of the result.
4918 SDValue CvtPtr =
4919 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4920 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4921
4922 return DAG.getMergeValues({Result, TokenReg}, SL);
4923}
4924
// SITargetLowering::lowerSET_FPENV (signature dropped by the export).
// Splits an i64 FP-environment value into its MODE (low) and TRAP (high)
// halves and writes each back with s_setreg.
4926 SDLoc SL(Op);
4927 if (Op.getOperand(1).getValueType() != MVT::i64)
4928 return Op;
4929
// View the i64 input as two i32 lanes: elt0 = MODE, elt1 = TRAP.
4930 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4931 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4932 DAG.getConstant(0, SL, MVT::i32));
4933 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4934 DAG.getConstant(1, SL, MVT::i32));
4935
// s_setreg takes a scalar operand; readfirstlane the values in case they
// live in VGPRs.
4936 SDValue ReadFirstLaneID =
4937 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4938 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4939 ReadFirstLaneID, NewModeReg);
4940 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4941 ReadFirstLaneID, NewTrapReg);
4942
// NOTE(review): the HwregEncoding::encode(...) initializers on dropped
// export lines (original 4944/4947) are not visible here.
4943 unsigned ModeHwReg =
4945 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4946 unsigned TrapHwReg =
4948 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4949
4950 SDValue IntrinID =
4951 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4952 SDValue SetModeReg =
4953 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4954 IntrinID, ModeHwRegImm, NewModeReg);
4955 SDValue SetTrapReg =
4956 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4957 IntrinID, TrapHwRegImm, NewTrapReg);
// Tie both setreg stores into a single output chain.
4958 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4959}
4960
// SITargetLowering::getRegisterByName — resolves a named physical register
// for llvm.read_register / llvm.write_register and checks that the
// requested type width matches the register. NOTE(review): the signature
// and the StringSwitch head were on lines dropped by the export.
4962 const MachineFunction &MF) const {
4963 const Function &Fn = MF.getFunction();
4964
// The StringSwitch opening (dropped line) feeds these cases.
4966 .Case("m0", AMDGPU::M0)
4967 .Case("exec", AMDGPU::EXEC)
4968 .Case("exec_lo", AMDGPU::EXEC_LO)
4969 .Case("exec_hi", AMDGPU::EXEC_HI)
4970 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4971 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4972 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4973 .Default(Register());
// Unknown names yield a null Register, returned as-is.
4974 if (!Reg)
4975 return Reg;
4976
// flat_scratch (and its halves) is invalid on subtargets that do not have
// the dedicated FLAT_SCR register.
4977 if (!Subtarget->hasFlatScrRegister() &&
4978 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4979 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4980 "\" for subtarget."));
4981 }
4982
// Enforce 32-bit width for the half registers and 64-bit for the pairs.
4983 switch (Reg) {
4984 case AMDGPU::M0:
4985 case AMDGPU::EXEC_LO:
4986 case AMDGPU::EXEC_HI:
4987 case AMDGPU::FLAT_SCR_LO:
4988 case AMDGPU::FLAT_SCR_HI:
4989 if (VT.getSizeInBits() == 32)
4990 return Reg;
4991 break;
4992 case AMDGPU::EXEC:
4993 case AMDGPU::FLAT_SCR:
4994 if (VT.getSizeInBits() == 64)
4995 return Reg;
4996 break;
4997 default:
4998 llvm_unreachable("missing register type checking");
4999 }
5000
// Size mismatch: report an error. NOTE(review): the call head is on a
// dropped export line; only its Twine argument is visible.
5002 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5003}
5004
5005// If kill is not the last instruction, split the block so kill is always a
5006// proper terminator.
// NOTE(review): the signature (SITargetLowering::splitKillBlock) and the
// TII local were on lines dropped by the export.
5009 MachineBasicBlock *BB) const {
// Everything after MI moves to SplitBB; MI becomes BB's last instruction.
5010 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
// Swap the kill pseudo for its terminator form so it is a legal terminator.
5012 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5013 return SplitBB;
5014}
5015
5016// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
5017// \p MI will be the only instruction in the loop body block. Otherwise, it will
5018// be the first instruction in the remainder block.
5019//
5020/// \returns { LoopBody, Remainder }
// NOTE(review): the parameter list and the LoopBB / MBBI local declarations
// were on lines dropped by the export.
5021static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5023 MachineFunction *MF = MBB.getParent();
5025
5026 // To insert the loop we need to split the block. Move everything after this
5027 // point to a new block, and insert a new empty block between the two.
5029 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5031 ++MBBI;
5032
// Function layout order becomes: MBB, LoopBB, RemainderBB.
5033 MF->insert(MBBI, LoopBB);
5034 MF->insert(MBBI, RemainderBB);
5035
// The loop body can branch back to itself or exit to the remainder.
5036 LoopBB->addSuccessor(LoopBB);
5037 LoopBB->addSuccessor(RemainderBB);
5038
5039 // Move the rest of the block into a new block.
5040 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5041
5042 if (InstInLoop) {
5043 auto Next = std::next(I);
5044
5045 // Move instruction to loop body.
5046 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5047
5048 // Move the rest of the block.
5049 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5050 } else {
5051 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5052 }
5053
5054 MBB.addSuccessor(LoopBB);
5055
5056 return std::pair(LoopBB, RemainderBB);
5057}
5058
5059/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
// NOTE(review): the signature and the TII local were on lines dropped by
// the export.
5061 MachineBasicBlock *MBB = MI.getParent();
5063 auto I = MI.getIterator();
5064 auto E = std::next(I);
5065
// Emit the s_waitcnt 0 right after MI, then bundle the pair so nothing can
// be scheduled between them.
5066 // clang-format off
5067 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5068 .addImm(0);
5069 // clang-format on
5070
5071 MIBundleBuilder Bundler(*MBB, I, E);
5072 finalizeBundle(*MBB, Bundler.begin());
5073}
5074
5077 MachineBasicBlock *BB) const {
5078 const DebugLoc &DL = MI.getDebugLoc();
5079
5081
5083
5084 // Apparently kill flags are only valid if the def is in the same block?
5085 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5086 Src->setIsKill(false);
5087
5088 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5089
5090 MachineBasicBlock::iterator I = LoopBB->end();
5091
5092 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5094
5095 // Clear TRAP_STS.MEM_VIOL
5096 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5097 .addImm(0)
5098 .addImm(EncodedReg);
5099
5101
5102 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5103
5104 // Load and check TRAP_STS.MEM_VIOL
5105 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5106 .addImm(EncodedReg);
5107
5108 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5109 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5110 .addReg(Reg, RegState::Kill)
5111 .addImm(0);
5112 // clang-format off
5113 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5114 .addMBB(LoopBB);
5115 // clang-format on
5116
5117 return RemainderBB;
5118}
5119
5120// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5121// wavefront. If the value is uniform and just happens to be in a VGPR, this
5122// will only do one iteration. In the worst case, this will loop 64 times.
5123//
5124// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5127 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5128 const DebugLoc &DL, const MachineOperand &Idx,
5129 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5130 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5131 Register &SGPRIdxReg) {
5132
5133 MachineFunction *MF = OrigBB.getParent();
5134 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5135 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5138
5139 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5140 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5141 Register NewExec = MRI.createVirtualRegister(BoolRC);
5142 Register CurrentIdxReg =
5143 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5144 Register CondReg = MRI.createVirtualRegister(BoolRC);
5145
5146 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5147 .addReg(InitReg)
5148 .addMBB(&OrigBB)
5149 .addReg(ResultReg)
5150 .addMBB(&LoopBB);
5151
5152 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5153 .addReg(InitSaveExecReg)
5154 .addMBB(&OrigBB)
5155 .addReg(NewExec)
5156 .addMBB(&LoopBB);
5157
5158 // Read the next variant <- also loop target.
5159 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5160 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5161
5162 // Compare the just read M0 value to all possible Idx values.
5163 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5164 .addReg(CurrentIdxReg)
5165 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5166
5167 // Update EXEC, save the original EXEC value to VCC.
5168 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5169 .addReg(CondReg, RegState::Kill);
5170
5171 MRI.setSimpleHint(NewExec, CondReg);
5172
5173 if (UseGPRIdxMode) {
5174 if (Offset == 0) {
5175 SGPRIdxReg = CurrentIdxReg;
5176 } else {
5177 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5178 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5179 .addReg(CurrentIdxReg, RegState::Kill)
5180 .addImm(Offset);
5181 }
5182 } else {
5183 // Move index from VCC into M0
5184 if (Offset == 0) {
5185 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5186 .addReg(CurrentIdxReg, RegState::Kill);
5187 } else {
5188 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5189 .addReg(CurrentIdxReg, RegState::Kill)
5190 .addImm(Offset);
5191 }
5192 }
5193
5194 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5195 MachineInstr *InsertPt =
5196 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5197 .addReg(LMC.ExecReg)
5198 .addReg(NewExec);
5199
5200 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5201 // s_cbranch_scc0?
5202
5203 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5204 // clang-format off
5205 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5206 .addMBB(&LoopBB);
5207 // clang-format on
5208
5209 return InsertPt->getIterator();
5210}
5211
5212// This has slightly sub-optimal regalloc when the source vector is killed by
5213// the read. The register allocator does not understand that the kill is
5214// per-workitem, so is kept alive for the whole loop so we end up not re-using a
5215// subregister from it, using 1 more VGPR than necessary. This was saved when
5216// this was expanded after register allocation.
5219 unsigned InitResultReg, unsigned PhiReg, int Offset,
5220 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5221 MachineFunction *MF = MBB.getParent();
5222 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5223 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5224 MachineRegisterInfo &MRI = MF->getRegInfo();
5225 const DebugLoc &DL = MI.getDebugLoc();
5227
5228 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5229 Register DstReg = MI.getOperand(0).getReg();
5230 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5231 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5233
5234 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5235
5236 // Save the EXEC mask
5237 // clang-format off
5238 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5239 .addReg(LMC.ExecReg);
5240 // clang-format on
5241
5242 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5243
5244 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5245
5246 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5247 InitResultReg, DstReg, PhiReg, TmpExec,
5248 Offset, UseGPRIdxMode, SGPRIdxReg);
5249
5250 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5252 ++MBBI;
5253 MF->insert(MBBI, LandingPad);
5254 LoopBB->removeSuccessor(RemainderBB);
5255 LandingPad->addSuccessor(RemainderBB);
5256 LoopBB->addSuccessor(LandingPad);
5257 MachineBasicBlock::iterator First = LandingPad->begin();
5258 // clang-format off
5259 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5260 .addReg(SaveExec);
5261 // clang-format on
5262
5263 return InsPt;
5264}
5265
5266// Returns subreg index, offset
// Given a constant element offset into the \p SuperRC-sized vector
// register, return the 32-bit channel subregister to address plus any
// residual offset. NOTE(review): the line with the function name and first
// parameter was dropped by the export.
5267static std::pair<unsigned, int>
5269 const TargetRegisterClass *SuperRC, unsigned VecReg,
5270 int Offset) {
// Number of 32-bit channels in the super-register.
5271 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5272
5273 // Skip out of bounds offsets, or else we would end up using an undefined
5274 // register.
5275 if (Offset >= NumElts || Offset < 0)
5276 return std::pair(AMDGPU::sub0, Offset);
5277
// In bounds: fold the entire offset into the subregister index.
5278 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5279}
5280
// setM0ToIndexFromSGPR — materialize the SGPR index operand of \p MI, plus
// a constant \p Offset, into M0. NOTE(review): the signature head and a
// local declaration were on lines dropped by the export.
5283 int Offset) {
5284 MachineBasicBlock *MBB = MI.getParent();
5285 const DebugLoc &DL = MI.getDebugLoc();
5287
5288 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5289
5290 assert(Idx->getReg() != AMDGPU::NoRegister);
5291
// With no offset a plain copy suffices; otherwise fold the offset with a
// scalar add written directly to M0.
5292 if (Offset == 0) {
5293 // clang-format off
5294 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5295 .add(*Idx);
5296 // clang-format on
5297 } else {
5298 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5299 .add(*Idx)
5300 .addImm(Offset);
5301 }
5302}
5303
// getIndirectSGPRIdx — return an SGPR holding the index operand of \p MI
// plus constant \p Offset, creating a temporary s_add_i32 only when the
// offset is non-zero. NOTE(review): the signature head and a local
// declaration were on lines dropped by the export.
5306 int Offset) {
5307 MachineBasicBlock *MBB = MI.getParent();
5308 const DebugLoc &DL = MI.getDebugLoc();
5310
5311 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5312
// Zero offset: reuse the index register directly, no copy needed.
5313 if (Offset == 0)
5314 return Idx->getReg();
5315
5316 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5317 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5318 .add(*Idx)
5319 .addImm(Offset);
5320 return Tmp;
5321}
5322
5325 const GCNSubtarget &ST) {
5326 const SIInstrInfo *TII = ST.getInstrInfo();
5327 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5328 MachineFunction *MF = MBB.getParent();
5329 MachineRegisterInfo &MRI = MF->getRegInfo();
5330
5331 Register Dst = MI.getOperand(0).getReg();
5332 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5333 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5334 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5335
5336 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5337 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5338
5339 unsigned SubReg;
5340 std::tie(SubReg, Offset) =
5341 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5342
5343 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5344
5345 // Check for a SGPR index.
5346 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 const DebugLoc &DL = MI.getDebugLoc();
5349
5350 if (UseGPRIdxMode) {
5351 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5352 // to avoid interfering with other uses, so probably requires a new
5353 // optimization pass.
5354 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5355
5356 const MCInstrDesc &GPRIDXDesc =
5357 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5358 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5359 .addReg(SrcReg)
5360 .addReg(Idx)
5361 .addImm(SubReg);
5362 } else {
5364
5365 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5366 .addReg(SrcReg, {}, SubReg)
5367 .addReg(SrcReg, RegState::Implicit);
5368 }
5369
5370 MI.eraseFromParent();
5371
5372 return &MBB;
5373 }
5374
5375 // Control flow needs to be inserted if indexing with a VGPR.
5376 const DebugLoc &DL = MI.getDebugLoc();
5378
5379 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5380 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5381
5382 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5383
5384 Register SGPRIdxReg;
5385 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5386 UseGPRIdxMode, SGPRIdxReg);
5387
5388 MachineBasicBlock *LoopBB = InsPt->getParent();
5389
5390 if (UseGPRIdxMode) {
5391 const MCInstrDesc &GPRIDXDesc =
5392 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5393
5394 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5395 .addReg(SrcReg)
5396 .addReg(SGPRIdxReg)
5397 .addImm(SubReg);
5398 } else {
5399 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5400 .addReg(SrcReg, {}, SubReg)
5401 .addReg(SrcReg, RegState::Implicit);
5402 }
5403
5404 MI.eraseFromParent();
5405
5406 return LoopBB;
5407}
5408
5411 const GCNSubtarget &ST) {
5412 const SIInstrInfo *TII = ST.getInstrInfo();
5413 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5414 MachineFunction *MF = MBB.getParent();
5415 MachineRegisterInfo &MRI = MF->getRegInfo();
5416
5417 Register Dst = MI.getOperand(0).getReg();
5418 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5419 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5420 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5421 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5422 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5423 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5424
5425 // This can be an immediate, but will be folded later.
5426 assert(Val->getReg());
5427
5428 unsigned SubReg;
5429 std::tie(SubReg, Offset) =
5430 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5431 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5432
5433 if (Idx->getReg() == AMDGPU::NoRegister) {
5435 const DebugLoc &DL = MI.getDebugLoc();
5436
5437 assert(Offset == 0);
5438
5439 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5440 .add(*SrcVec)
5441 .add(*Val)
5442 .addImm(SubReg);
5443
5444 MI.eraseFromParent();
5445 return &MBB;
5446 }
5447
5448 // Check for a SGPR index.
5449 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5451 const DebugLoc &DL = MI.getDebugLoc();
5452
5453 if (UseGPRIdxMode) {
5454 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5455
5456 const MCInstrDesc &GPRIDXDesc =
5457 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5458 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5459 .addReg(SrcVec->getReg())
5460 .add(*Val)
5461 .addReg(Idx)
5462 .addImm(SubReg);
5463 } else {
5465
5466 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5467 TRI.getRegSizeInBits(*VecRC), 32, false);
5468 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5469 .addReg(SrcVec->getReg())
5470 .add(*Val)
5471 .addImm(SubReg);
5472 }
5473 MI.eraseFromParent();
5474 return &MBB;
5475 }
5476
5477 // Control flow needs to be inserted if indexing with a VGPR.
5478 if (Val->isReg())
5479 MRI.clearKillFlags(Val->getReg());
5480
5481 const DebugLoc &DL = MI.getDebugLoc();
5482
5483 Register PhiReg = MRI.createVirtualRegister(VecRC);
5484
5485 Register SGPRIdxReg;
5486 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5487 UseGPRIdxMode, SGPRIdxReg);
5488 MachineBasicBlock *LoopBB = InsPt->getParent();
5489
5490 if (UseGPRIdxMode) {
5491 const MCInstrDesc &GPRIDXDesc =
5492 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5493
5494 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5495 .addReg(PhiReg)
5496 .add(*Val)
5497 .addReg(SGPRIdxReg)
5498 .addImm(SubReg);
5499 } else {
5500 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5501 TRI.getRegSizeInBits(*VecRC), 32, false);
5502 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5503 .addReg(PhiReg)
5504 .add(*Val)
5505 .addImm(SubReg);
5506 }
5507
5508 MI.eraseFromParent();
5509 return LoopBB;
5510}
5511
// Expansion of S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO. NOTE(review): the
// function signature and the MRI local were on lines dropped by the export.
5513 MachineBasicBlock *BB) {
5514 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5515 // For GFX12, we emit s_add_u64 and s_sub_u64.
5516 MachineFunction *MF = BB->getParent();
5517 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5518 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5520 const DebugLoc &DL = MI.getDebugLoc();
5521 MachineOperand &Dest = MI.getOperand(0);
5522 MachineOperand &Src0 = MI.getOperand(1);
5523 MachineOperand &Src1 = MI.getOperand(2);
5524 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
// Native 64-bit scalar add/sub available: emit it directly.
5525 if (ST.hasScalarAddSub64()) {
5526 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5527 // clang-format off
5528 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5529 .add(Src0)
5530 .add(Src1);
5531 // clang-format on
5532 } else {
// Otherwise split into lo/hi halves chained through SCC:
// s_add_u32 + s_addc_u32 (or s_sub_u32 + s_subb_u32 for subtraction).
5533 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5534 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5535
5536 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5537 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5538
5539 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5540 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5541 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5542 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5543
5544 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5545 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5546 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5547 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5548
5549 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5550 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5551 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5552 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
// Reassemble the 64-bit result from the two 32-bit halves.
5553 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5554 .addReg(DestSub0)
5555 .addImm(AMDGPU::sub0)
5556 .addReg(DestSub1)
5557 .addImm(AMDGPU::sub1);
5558 }
5559 MI.eraseFromParent();
5560 return BB;
5561}
5562
// getIdentityValueFor32BitWaveReduction (signature on a line dropped by the
// export): neutral start value for a 32-bit wave reduction with combining
// op \p Opc, returned as a raw 32-bit bit pattern.
5564 switch (Opc) {
5565 case AMDGPU::S_MIN_U32:
5566 return std::numeric_limits<uint32_t>::max();
5567 case AMDGPU::S_MIN_I32:
5568 return std::numeric_limits<int32_t>::max();
5569 case AMDGPU::S_MAX_U32:
5570 return std::numeric_limits<uint32_t>::min();
5571 case AMDGPU::S_MAX_I32:
5572 return std::numeric_limits<int32_t>::min();
5573 case AMDGPU::V_ADD_F32_e64: // -0.0
5574 return 0x80000000;
5575 case AMDGPU::V_SUB_F32_e64: // +0.0
5576 return 0x0;
5577 case AMDGPU::S_ADD_I32:
5578 case AMDGPU::S_SUB_I32:
5579 case AMDGPU::S_OR_B32:
5580 case AMDGPU::S_XOR_B32:
5581 return std::numeric_limits<uint32_t>::min();
5582 case AMDGPU::S_AND_B32:
5583 return std::numeric_limits<uint32_t>::max();
// NOTE(review): qNaN as min/max identity assumes the op prefers the
// non-NaN operand (minNum/maxNum-style) — confirm against the ISA.
5584 case AMDGPU::V_MIN_F32_e64:
5585 case AMDGPU::V_MAX_F32_e64:
5586 return 0x7fc00000; // qNAN
// The llvm_unreachable( call head is on a line dropped by the export.
5587 default:
5589 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5590 }
5591}
5592
// getIdentityValueFor64BitWaveReduction (signature on a line dropped by the
// export): neutral start value for a 64-bit wave reduction with combining
// op \p Opc, returned as a raw 64-bit bit pattern. The V_CMP_* opcodes
// stand in for 64-bit min/max reductions (see the inline comments).
5594 switch (Opc) {
5595 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5596 return std::numeric_limits<uint64_t>::max();
5597 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5598 return std::numeric_limits<int64_t>::max();
5599 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5600 return std::numeric_limits<uint64_t>::min();
5601 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5602 return std::numeric_limits<int64_t>::min();
// NOTE(review): qNaN as min/max identity assumes the op prefers the
// non-NaN operand (minNum/maxNum-style) — confirm against the ISA.
5603 case AMDGPU::V_MIN_F64_e64:
5604 case AMDGPU::V_MAX_F64_e64:
5605 case AMDGPU::V_MIN_NUM_F64_e64:
5606 case AMDGPU::V_MAX_NUM_F64_e64:
5607 return 0x7FF8000000000000; // qNAN
5608 case AMDGPU::S_ADD_U64_PSEUDO:
5609 case AMDGPU::S_SUB_U64_PSEUDO:
5610 case AMDGPU::S_OR_B64:
5611 case AMDGPU::S_XOR_B64:
5612 return std::numeric_limits<uint64_t>::min();
5613 case AMDGPU::S_AND_B64:
5614 return std::numeric_limits<uint64_t>::max();
5615 case AMDGPU::V_ADD_F64_e64:
5616 case AMDGPU::V_ADD_F64_pseudo_e64:
5617 return 0x8000000000000000; // -0.0
// The llvm_unreachable( call head is on a line dropped by the export.
5618 default:
5620 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5621 }
5622}
5623
5624static bool is32bitWaveReduceOperation(unsigned Opc) {
5625 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5626 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5627 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5628 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5629 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5630 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5631 Opc == AMDGPU::V_SUB_F32_e64;
5632}
5633
// isFloatingPointWaveReduceOperation (signature on a line dropped by the
// export): true when the wave-reduction combining op is one of the
// floating-point VALU opcodes (f32 or f64 variants).
5635 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5636 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5637 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5638 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5639 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5640}
5641
5642static unsigned getDPPOpcForWaveReduction(unsigned Opc,
5643 const GCNSubtarget &ST) {
5644 switch (Opc) {
5645 case AMDGPU::S_MIN_U32:
5646 return AMDGPU::V_MIN_U32_dpp;
5647 case AMDGPU::S_MIN_I32:
5648 return AMDGPU::V_MIN_I32_dpp;
5649 case AMDGPU::S_MAX_U32:
5650 return AMDGPU::V_MAX_U32_dpp;
5651 case AMDGPU::S_MAX_I32:
5652 return AMDGPU::V_MAX_I32_dpp;
5653 case AMDGPU::S_ADD_I32:
5654 case AMDGPU::S_SUB_I32:
5655 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5656 : AMDGPU::V_ADD_CO_U32_dpp;
5657 case AMDGPU::S_AND_B32:
5658 return AMDGPU::V_AND_B32_dpp;
5659 case AMDGPU::S_OR_B32:
5660 return AMDGPU::V_OR_B32_dpp;
5661 case AMDGPU::S_XOR_B32:
5662 return AMDGPU::V_XOR_B32_dpp;
5663 default:
5664 llvm_unreachable("unhandled lane op");
5665 }
5666}
5667
5670 const GCNSubtarget &ST,
5671 unsigned Opc) {
5673 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5674 const DebugLoc &DL = MI.getDebugLoc();
5675 const SIInstrInfo *TII = ST.getInstrInfo();
5676
5677 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5678 Register SrcReg = MI.getOperand(1).getReg();
5679 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5680 Register DstReg = MI.getOperand(0).getReg();
5681 unsigned Stratergy = static_cast<unsigned>(MI.getOperand(2).getImm());
5682 enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
5683 MachineBasicBlock *RetBB = nullptr;
5684 if (isSGPR) {
5685 switch (Opc) {
5686 case AMDGPU::S_MIN_U32:
5687 case AMDGPU::S_MIN_I32:
5688 case AMDGPU::V_MIN_F32_e64:
5689 case AMDGPU::S_MAX_U32:
5690 case AMDGPU::S_MAX_I32:
5691 case AMDGPU::V_MAX_F32_e64:
5692 case AMDGPU::S_AND_B32:
5693 case AMDGPU::S_OR_B32: {
5694 // Idempotent operations.
5695 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5696 RetBB = &BB;
5697 break;
5698 }
5699 case AMDGPU::V_CMP_LT_U64_e64: // umin
5700 case AMDGPU::V_CMP_LT_I64_e64: // min
5701 case AMDGPU::V_CMP_GT_U64_e64: // umax
5702 case AMDGPU::V_CMP_GT_I64_e64: // max
5703 case AMDGPU::V_MIN_F64_e64:
5704 case AMDGPU::V_MIN_NUM_F64_e64:
5705 case AMDGPU::V_MAX_F64_e64:
5706 case AMDGPU::V_MAX_NUM_F64_e64:
5707 case AMDGPU::S_AND_B64:
5708 case AMDGPU::S_OR_B64: {
5709 // Idempotent operations.
5710 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5711 RetBB = &BB;
5712 break;
5713 }
5714 case AMDGPU::S_XOR_B32:
5715 case AMDGPU::S_XOR_B64:
5716 case AMDGPU::S_ADD_I32:
5717 case AMDGPU::S_ADD_U64_PSEUDO:
5718 case AMDGPU::V_ADD_F32_e64:
5719 case AMDGPU::V_ADD_F64_e64:
5720 case AMDGPU::V_ADD_F64_pseudo_e64:
5721 case AMDGPU::S_SUB_I32:
5722 case AMDGPU::S_SUB_U64_PSEUDO:
5723 case AMDGPU::V_SUB_F32_e64: {
5724 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5725 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5726 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5727 Register NumActiveLanes =
5728 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5729
5730 bool IsWave32 = ST.isWave32();
5731 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5732 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5733 unsigned BitCountOpc =
5734 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5735
5736 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5737
5738 auto NewAccumulator =
5739 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5740 .addReg(ExecMask);
5741
5742 switch (Opc) {
5743 case AMDGPU::S_XOR_B32:
5744 case AMDGPU::S_XOR_B64: {
5745 // Performing an XOR operation on a uniform value
5746 // depends on the parity of the number of active lanes.
5747 // For even parity, the result will be 0, for odd
5748 // parity the result will be the same as the input value.
5749 Register ParityRegister =
5750 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5751
5752 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5753 .addReg(NewAccumulator->getOperand(0).getReg())
5754 .addImm(1)
5755 .setOperandDead(3); // Dead scc
5756 if (Opc == AMDGPU::S_XOR_B32) {
5757 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5758 .addReg(SrcReg)
5759 .addReg(ParityRegister);
5760 } else {
5761 Register DestSub0 =
5762 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5763 Register DestSub1 =
5764 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5765
5766 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5767 const TargetRegisterClass *SrcSubRC =
5768 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5769
5770 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5771 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5772 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5773 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5774
5775 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5776 .add(Op1L)
5777 .addReg(ParityRegister);
5778
5779 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5780 .add(Op1H)
5781 .addReg(ParityRegister);
5782
5783 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5784 .addReg(DestSub0)
5785 .addImm(AMDGPU::sub0)
5786 .addReg(DestSub1)
5787 .addImm(AMDGPU::sub1);
5788 }
5789 break;
5790 }
5791 case AMDGPU::S_SUB_I32: {
5792 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5793
5794 // Take the negation of the source operand.
5795 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5796 .addImm(0)
5797 .addReg(SrcReg);
5798 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5799 .addReg(NegatedVal)
5800 .addReg(NewAccumulator->getOperand(0).getReg());
5801 break;
5802 }
5803 case AMDGPU::S_ADD_I32: {
5804 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5805 .addReg(SrcReg)
5806 .addReg(NewAccumulator->getOperand(0).getReg());
5807 break;
5808 }
5809 case AMDGPU::S_ADD_U64_PSEUDO:
5810 case AMDGPU::S_SUB_U64_PSEUDO: {
5811 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5812 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5813 Register Op1H_Op0L_Reg =
5814 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5815 Register Op1L_Op0H_Reg =
5816 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5817 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5818 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5819 Register NegatedValLo =
5820 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5821 Register NegatedValHi =
5822 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5823
5824 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5825 const TargetRegisterClass *Src1SubRC =
5826 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5827
5828 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5829 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5830 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5831 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5832
5833 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5834 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5835 .addImm(0)
5836 .addReg(NewAccumulator->getOperand(0).getReg())
5837 .setOperandDead(3); // Dead scc
5838 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5839 .addReg(NegatedValLo)
5840 .addImm(31)
5841 .setOperandDead(3); // Dead scc
5842 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5843 .add(Op1L)
5844 .addReg(NegatedValHi);
5845 }
5846 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5847 ? NegatedValLo
5848 : NewAccumulator->getOperand(0).getReg();
5849 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5850 .add(Op1L)
5851 .addReg(LowOpcode);
5852 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5853 .add(Op1L)
5854 .addReg(LowOpcode);
5855 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5856 .add(Op1H)
5857 .addReg(LowOpcode);
5858
5859 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5860 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5861 .addReg(CarryReg)
5862 .addReg(Op1H_Op0L_Reg)
5863 .setOperandDead(3); // Dead scc
5864
5865 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5866 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5867 .addReg(HiVal)
5868 .addReg(Op1L_Op0H_Reg)
5869 .setOperandDead(3); // Dead scc
5870 }
5871 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5872 .addReg(DestSub0)
5873 .addImm(AMDGPU::sub0)
5874 .addReg(DestSub1)
5875 .addImm(AMDGPU::sub1);
5876 break;
5877 }
5878 case AMDGPU::V_ADD_F32_e64:
5879 case AMDGPU::V_ADD_F64_e64:
5880 case AMDGPU::V_ADD_F64_pseudo_e64:
5881 case AMDGPU::V_SUB_F32_e64: {
5882 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5883 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
5884 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
5885 Register DstVreg = MRI.createVirtualRegister(VregRC);
5886 // Get number of active lanes as a float val.
5887 BuildMI(BB, MI, DL,
5888 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5889 : AMDGPU::V_CVT_F64_I32_e64),
5890 ActiveLanesVreg)
5891 .addReg(NewAccumulator->getOperand(0).getReg())
5892 .addImm(0) // clamp
5893 .addImm(0); // output-modifier
5894
5895 // Take negation of input for SUB reduction
5896 unsigned srcMod =
5897 (Opc == AMDGPU::V_SUB_F32_e64 ||
5898 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5901 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5902 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5903 ? AMDGPU::V_MUL_F64_pseudo_e64
5904 : AMDGPU::V_MUL_F64_e64;
5905 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
5906 DstVreg)
5907 .addImm(srcMod) // src0 modifier
5908 .addReg(SrcReg)
5909 .addImm(SISrcMods::NONE) // src1 modifier
5910 .addReg(ActiveLanesVreg)
5911 .addImm(SISrcMods::NONE) // clamp
5912 .addImm(SISrcMods::NONE); // output-mod
5913 if (is32BitOpc) {
5914 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5915 .addReg(DstVreg);
5916 } else {
5917 Register LaneValueLoReg =
5918 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5919 Register LaneValueHiReg =
5920 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5921 const TargetRegisterClass *VregSubRC =
5922 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5923 MachineOperand Op1L =
5924 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5925 VregRC, AMDGPU::sub0, VregSubRC);
5926 MachineOperand Op1H =
5927 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5928 VregRC, AMDGPU::sub1, VregSubRC);
5929 // lane value input should be in an sgpr
5930 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5931 LaneValueLoReg)
5932 .add(Op1L);
5933 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5934 LaneValueHiReg)
5935 .add(Op1H);
5936 NewAccumulator =
5937 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5938 .addReg(LaneValueLoReg)
5939 .addImm(AMDGPU::sub0)
5940 .addReg(LaneValueHiReg)
5941 .addImm(AMDGPU::sub1);
5942 }
5943 }
5944 }
5945 RetBB = &BB;
5946 }
5947 }
5948 } else {
5950 Register SrcReg = MI.getOperand(1).getReg();
5951 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5953 // Create virtual registers required for lowering.
5954 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5955 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5956 const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
5957 bool IsWave32 = ST.isWave32();
5958 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5959 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5960 if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
5961 !ST.hasDPP()) { // If target doesn't support DPP operations, default to
 5962 // iterative strategy
5963
5964 // To reduce the VGPR using iterative approach, we need to iterate
5965 // over all the active lanes. Lowering consists of ComputeLoop,
5966 // which iterate over only active lanes. We use copy of EXEC register
5967 // as induction variable and every active lane modifies it using bitset0
5968 // so that we will get the next active lane for next iteration.
5969
5970 // Create Control flow for loop
5971 // Split MI's Machine Basic block into For loop
5972 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5973
5974 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5975 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5976 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5977 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5978 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5979 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5980 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5981
5982 // Create initial values of induction variable from Exec, Accumulator and
5983 // insert branch instr to newly created ComputeBlock
5984 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5985 if (is32BitOpc) {
5987 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5988 .addImm(IdentityValue);
5989 } else {
5990 uint64_t IdentityValue =
5991 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5992 ? 0x0 // +0.0 for double sub reduction
5994 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
5995 IdentityValReg)
5996 .addImm(IdentityValue);
5997 }
5998 // clang-format off
5999 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
6000 .addMBB(ComputeLoop);
6001 // clang-format on
6002
6003 // Start constructing ComputeLoop
6004 I = ComputeLoop->begin();
6005 auto Accumulator =
6006 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
6007 .addReg(IdentityValReg)
6008 .addMBB(&BB);
6009 auto ActiveBits =
6010 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
6011 .addReg(LoopIterator)
6012 .addMBB(&BB);
6013
6014 I = ComputeLoop->end();
6015 MachineInstr *NewAccumulator;
6016 // Perform the computations
6017 unsigned SFFOpc =
6018 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6019 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
6020 .addReg(ActiveBitsReg);
6021 if (is32BitOpc) {
6022 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6023 LaneValueReg)
6024 .addReg(SrcReg)
6025 .addReg(FF1Reg);
6026 if (isFPOp) {
6027 Register LaneValVreg =
6028 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6029 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6030 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
6031 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
6032 LaneValVreg)
6033 .addReg(LaneValueReg);
6034 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6035 .addImm(0) // src0 modifier
6036 .addReg(Accumulator->getOperand(0).getReg())
6037 .addImm(0) // src1 modifier
6038 .addReg(LaneValVreg)
6039 .addImm(0) // clamp
6040 .addImm(0); // omod
6041 NewAccumulator =
6042 BuildMI(*ComputeLoop, I, DL,
6043 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6044 .addReg(DstVreg);
6045 } else {
6046 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6047 .addReg(Accumulator->getOperand(0).getReg())
6048 .addReg(LaneValueReg);
6049 }
6050 } else {
6051 Register LaneValueLoReg =
6052 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6053 Register LaneValueHiReg =
6054 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6055 Register LaneValReg =
6056 MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6057 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6058 const TargetRegisterClass *SrcSubRC =
6059 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6060 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6061 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
6062 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6063 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
6064 // lane value input should be in an sgpr
6065 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6066 LaneValueLoReg)
6067 .add(Op1L)
6068 .addReg(FF1Reg);
6069 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6070 LaneValueHiReg)
6071 .add(Op1H)
6072 .addReg(FF1Reg);
6073 auto LaneValue =
6074 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
6075 LaneValReg)
6076 .addReg(LaneValueLoReg)
6077 .addImm(AMDGPU::sub0)
6078 .addReg(LaneValueHiReg)
6079 .addImm(AMDGPU::sub1);
6080 switch (Opc) {
6081 case AMDGPU::S_OR_B64:
6082 case AMDGPU::S_AND_B64:
6083 case AMDGPU::S_XOR_B64: {
6084 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6085 .addReg(Accumulator->getOperand(0).getReg())
6086 .addReg(LaneValue->getOperand(0).getReg())
6087 .setOperandDead(3); // Dead scc
6088 break;
6089 }
6090 case AMDGPU::V_CMP_GT_I64_e64:
6091 case AMDGPU::V_CMP_GT_U64_e64:
6092 case AMDGPU::V_CMP_LT_I64_e64:
6093 case AMDGPU::V_CMP_LT_U64_e64: {
6094 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6095 Register ComparisonResultReg =
6096 MRI.createVirtualRegister(WaveMaskRegClass);
6097 int SrcIdx =
6098 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6099 const TargetRegisterClass *VregClass =
6100 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6101 const TargetRegisterClass *VSubRegClass =
6102 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6103 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6104 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6105 MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub0,
6106 VSubRegClass);
6107 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6108 MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub1,
6109 VSubRegClass);
6110 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
6111 AccumulatorVReg)
6112 .add(SrcReg0Sub0)
6113 .addImm(AMDGPU::sub0)
6114 .add(SrcReg0Sub1)
6115 .addImm(AMDGPU::sub1);
6116 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6117 .addReg(LaneValue->getOperand(0).getReg())
6118 .addReg(AccumulatorVReg);
6119
6120 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6121 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6122 .addReg(LaneMaskReg)
6123 .addReg(ActiveBitsReg);
6124
6125 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6126 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6127 .addReg(LaneValue->getOperand(0).getReg())
6128 .addReg(Accumulator->getOperand(0).getReg());
6129 break;
6130 }
6131 case AMDGPU::V_MIN_F64_e64:
6132 case AMDGPU::V_MIN_NUM_F64_e64:
6133 case AMDGPU::V_MAX_F64_e64:
6134 case AMDGPU::V_MAX_NUM_F64_e64:
6135 case AMDGPU::V_ADD_F64_e64:
6136 case AMDGPU::V_ADD_F64_pseudo_e64: {
6137 int SrcIdx =
6138 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6139 const TargetRegisterClass *VregRC =
6140 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6141 const TargetRegisterClass *VregSubRC =
6142 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6143 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6144 Register DstVreg = MRI.createVirtualRegister(VregRC);
6145 Register LaneValLo =
6146 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6147 Register LaneValHi =
6148 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6149 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6150 .addReg(Accumulator->getOperand(0).getReg());
6151 unsigned Modifier =
6152 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6155 auto DstVregInst =
6156 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6157 .addImm(Modifier) // src0 modifiers
6158 .addReg(LaneValue->getOperand(0).getReg())
6159 .addImm(SISrcMods::NONE) // src1 modifiers
6160 .addReg(AccumulatorVReg)
6161 .addImm(SISrcMods::NONE) // clamp
6162 .addImm(SISrcMods::NONE); // omod
6163 auto ReadLaneLo =
6164 BuildMI(*ComputeLoop, I, DL,
6165 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6166 auto ReadLaneHi =
6167 BuildMI(*ComputeLoop, I, DL,
6168 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6169 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6170 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6171 Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub0,
6172 VregSubRC);
6173 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6174 Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub1,
6175 VregSubRC);
6176 ReadLaneLo.add(Op1L);
6177 ReadLaneHi.add(Op1H);
6178 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6179 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6180 .addReg(LaneValLo)
6181 .addImm(AMDGPU::sub0)
6182 .addReg(LaneValHi)
6183 .addImm(AMDGPU::sub1);
6184 break;
6185 }
6186 case AMDGPU::S_ADD_U64_PSEUDO:
6187 case AMDGPU::S_SUB_U64_PSEUDO: {
6188 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6189 .addReg(Accumulator->getOperand(0).getReg())
6190 .addReg(LaneValue->getOperand(0).getReg());
6191 ComputeLoop =
6192 Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6193 break;
6194 }
6195 }
6196 }
6197 // Manipulate the iterator to get the next active lane
6198 unsigned BITSETOpc =
6199 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6200 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6201 .addReg(FF1Reg)
6202 .addReg(ActiveBitsReg);
6203
6204 // Add phi nodes
6205 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6206 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6207
6208 // Creating branching
6209 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6210 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6211 .addReg(NewActiveBitsReg)
6212 .addImm(0);
6213 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6214 .addMBB(ComputeLoop);
6215
6216 RetBB = ComputeEnd;
6217 } else {
6218 assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
6219
6220 Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
6221 Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
6222 Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
6223 Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
6224 Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
6225 Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
6226 Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
6227 Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
6228 Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
6229 Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
6230 Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
6231 Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
6232 Register FinalDPPResult;
6233 BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
6234
6236 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR)
6237 .addImm(IdentityValue);
6238 BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
6239 .addReg(IdentitySGPR);
6240
6241 // Set inactive lanes to the identity value.
6242 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32), SrcWithIdentity)
6243 .addImm(0) // src0 modifiers
6244 .addReg(SrcReg) // src0
6245 .addImm(0) // src1 modifiers
6246 .addReg(IdentityVGPR) // identity value for inactive lanes
6247 .addReg(UndefExec); // bool i1
6248
6249 unsigned DPPOpc = getDPPOpcForWaveReduction(Opc, ST);
6250 auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
6251 unsigned DPPCtrl) {
6252 BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst)
6253 .addReg(Src) // old
6254 .addReg(Src) // src0
6255 .addReg(Src) // src1
6256 .addImm(DPPCtrl) // dpp-ctrl
6257 .addImm(0xf) // row-mask
6258 .addImm(0xf) // bank-mask
6259 .addImm(0); // bound-control
6260 };
6261 // DPP reduction
6262 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity,
6264
6265 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6267
6268 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6270
6271 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6273
6274 if (ST.hasDPPBroadcasts()) {
6275 BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
6276 } else {
6277 // magic constant: 0x1E0
6278 // To Set BIT_MODE : bit 15 = 0
6279 // XOR mask : bit [14:10] = 0
6280 // OR mask : bit [9:5] = 15
6281 // AND mask : bit [4:0] = 0
6282 Register SwizzledValue =
6283 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6284 BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue)
6285 .addReg(DPPRowShr8) // addr
6286 .addImm(0x1E0) // swizzle offset (i16)
6287 .addImm(0x0); // gds (i1)
6288 auto ClampInstr =
6289 BuildMI(BB, MI, DL,
6290 TII->get(TII->getVALUOp(
6291 Opc == AMDGPU::S_SUB_I32
6292 ? static_cast<unsigned>(AMDGPU::S_ADD_I32)
6293 : Opc)),
6294 RowBcast15)
6295 .addReg(DPPRowShr8)
6296 .addReg(SwizzledValue);
6297 if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
6298 ClampInstr.addImm(0);
6299 }
6300 FinalDPPResult = RowBcast15;
6301 if (!IsWave32) {
6302 if (ST.hasDPPBroadcasts()) {
6303 BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
6304 } else {
6305 Register ShiftedThreadID =
6306 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6307 Register PermuteByteOffset =
6308 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6309 Register PermutedValue =
6310 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6311 Register Lane32Offset = MRI.createVirtualRegister(DstRegClass);
6312 Register WordSizeConst = MRI.createVirtualRegister(DstRegClass);
6313 Register ThreadIDRegLo =
6314 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6315 Register ThreadIDReg =
6316 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6317 // Get the thread ID.
6318 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
6319 ThreadIDRegLo)
6320 .addImm(-1)
6321 .addImm(0);
6322 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
6323 ThreadIDReg)
6324 .addImm(-1)
6325 .addReg(ThreadIDRegLo);
6326 // shift each lane over by 32 positions, so value in 31st lane is
6327 // present in 63rd lane.
6328 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
6329 .addImm(0x20);
6330 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID)
6331 .addReg(ThreadIDReg)
6332 .addReg(Lane32Offset)
6333 .addImm(0); // clamp
6334 // multiply by reg size.
6335 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
6336 .addImm(0x4);
6337 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
6338 PermuteByteOffset)
6339 .addReg(WordSizeConst)
6340 .addReg(ShiftedThreadID);
6341 // Permute the lanes
6342 BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue)
6343 .addReg(PermuteByteOffset) // addr
6344 .addReg(RowBcast15) // data
6345 .addImm(0); // offset
6346 auto ClampInstr =
6347 BuildMI(BB, MI, DL,
6348 TII->get(TII->getVALUOp(
6349 Opc == AMDGPU::S_SUB_I32
6350 ? static_cast<unsigned>(AMDGPU::S_ADD_I32)
6351 : Opc)),
6352 RowBcast31)
6353 .addReg(RowBcast15)
6354 .addReg(PermutedValue);
6355 if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
6356 ClampInstr.addImm(0);
6357 }
6358 FinalDPPResult = RowBcast31;
6359 }
6360 // The final reduced value is in the last lane.
6361 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR)
6362 .addReg(FinalDPPResult)
6363 .addImm(ST.getWavefrontSize() - 1);
6364 if (Opc == AMDGPU::S_SUB_I32)
6365 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6366 .addImm(0)
6367 .addReg(ReducedValSGPR);
6368 // Mark the final result as a whole-wave-mode calculation.
6369 BuildMI(BB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
6370 .addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
6371 : ReducedValSGPR);
6372 RetBB = &BB;
6373 }
6374 }
6375 MI.eraseFromParent();
6376 return RetBB;
6377}
6378
6381 MachineBasicBlock *BB) const {
6382 MachineFunction *MF = BB->getParent();
6384 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6386 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6387 MachineRegisterInfo &MRI = MF->getRegInfo();
6388 const DebugLoc &DL = MI.getDebugLoc();
6389
6390 switch (MI.getOpcode()) {
6391 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6392 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6393 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6394 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6395 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6396 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6397 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6398 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6399 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6400 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6401 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6402 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6403 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6404 ? AMDGPU::V_MIN_NUM_F64_e64
6405 : AMDGPU::V_MIN_F64_e64);
6406 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6407 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6408 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6409 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6410 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6411 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6412 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6413 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6414 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6415 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6416 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6417 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6418 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6419 ? AMDGPU::V_MAX_NUM_F64_e64
6420 : AMDGPU::V_MAX_F64_e64);
6421 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6422 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6423 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6424 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6425 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6426 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6427 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6428 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6429 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6430 ? AMDGPU::V_ADD_F64_pseudo_e64
6431 : AMDGPU::V_ADD_F64_e64);
6432 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6433 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6434 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6435 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6436 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6437 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6438 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6439 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6440 // fadd + neg, by setting the NEG bit in the instruction.
6441 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6442 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6443 ? AMDGPU::V_ADD_F64_pseudo_e64
6444 : AMDGPU::V_ADD_F64_e64);
6445 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6446 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6447 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6448 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6449 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6450 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6451 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6452 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6453 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6454 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6455 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6456 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6457 case AMDGPU::S_UADDO_PSEUDO:
6458 case AMDGPU::S_USUBO_PSEUDO: {
6459 MachineOperand &Dest0 = MI.getOperand(0);
6460 MachineOperand &Dest1 = MI.getOperand(1);
6461 MachineOperand &Src0 = MI.getOperand(2);
6462 MachineOperand &Src1 = MI.getOperand(3);
6463
6464 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6465 ? AMDGPU::S_ADD_U32
6466 : AMDGPU::S_SUB_U32;
6467 // clang-format off
6468 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6469 .add(Src0)
6470 .add(Src1);
6471 // clang-format on
6472
6473 unsigned SelOpc =
6474 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6475 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6476
6477 MI.eraseFromParent();
6478 return BB;
6479 }
6480 case AMDGPU::S_ADD_U64_PSEUDO:
6481 case AMDGPU::S_SUB_U64_PSEUDO: {
6482 return Expand64BitScalarArithmetic(MI, BB);
6483 }
6484 case AMDGPU::V_ADD_U64_PSEUDO:
6485 case AMDGPU::V_SUB_U64_PSEUDO: {
6486 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6487
6488 MachineOperand &Dest = MI.getOperand(0);
6489 MachineOperand &Src0 = MI.getOperand(1);
6490 MachineOperand &Src1 = MI.getOperand(2);
6491
6492 if (ST.hasAddSubU64Insts()) {
6493 auto I = BuildMI(*BB, MI, DL,
6494 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6495 : AMDGPU::V_SUB_U64_e64),
6496 Dest.getReg())
6497 .add(Src0)
6498 .add(Src1)
6499 .addImm(0); // clamp
6500 TII->legalizeOperands(*I);
6501 MI.eraseFromParent();
6502 return BB;
6503 }
6504
6505 if (IsAdd && ST.hasLshlAddU64Inst()) {
6506 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6507 Dest.getReg())
6508 .add(Src0)
6509 .addImm(0)
6510 .add(Src1);
6511 TII->legalizeOperands(*Add);
6512 MI.eraseFromParent();
6513 return BB;
6514 }
6515
6516 const auto *CarryRC = TRI->getWaveMaskRegClass();
6517
6518 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6519 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6520
6521 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6522 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6523
6524 const TargetRegisterClass *Src0RC = Src0.isReg()
6525 ? MRI.getRegClass(Src0.getReg())
6526 : &AMDGPU::VReg_64RegClass;
6527 const TargetRegisterClass *Src1RC = Src1.isReg()
6528 ? MRI.getRegClass(Src1.getReg())
6529 : &AMDGPU::VReg_64RegClass;
6530
6531 const TargetRegisterClass *Src0SubRC =
6532 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6533 const TargetRegisterClass *Src1SubRC =
6534 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6535
6536 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6537 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6538 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6539 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6540
6541 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6542 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6543 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6544 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6545
6546 unsigned LoOpc =
6547 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6548 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6549 .addReg(CarryReg, RegState::Define)
6550 .add(SrcReg0Sub0)
6551 .add(SrcReg1Sub0)
6552 .addImm(0); // clamp bit
6553
6554 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6555 MachineInstr *HiHalf =
6556 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6557 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6558 .add(SrcReg0Sub1)
6559 .add(SrcReg1Sub1)
6560 .addReg(CarryReg, RegState::Kill)
6561 .addImm(0); // clamp bit
6562
6563 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6564 .addReg(DestSub0)
6565 .addImm(AMDGPU::sub0)
6566 .addReg(DestSub1)
6567 .addImm(AMDGPU::sub1);
6568 TII->legalizeOperands(*LoHalf);
6569 TII->legalizeOperands(*HiHalf);
6570 MI.eraseFromParent();
6571 return BB;
6572 }
6573 case AMDGPU::S_ADD_CO_PSEUDO:
6574 case AMDGPU::S_SUB_CO_PSEUDO: {
6575 // This pseudo has a chance to be selected
6576 // only from uniform add/subcarry node. All the VGPR operands
6577 // therefore assumed to be splat vectors.
6579 MachineOperand &Dest = MI.getOperand(0);
6580 MachineOperand &CarryDest = MI.getOperand(1);
6581 MachineOperand &Src0 = MI.getOperand(2);
6582 MachineOperand &Src1 = MI.getOperand(3);
6583 MachineOperand &Src2 = MI.getOperand(4);
6584 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6585 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6586 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6587 .addReg(Src0.getReg());
6588 Src0.setReg(RegOp0);
6589 }
6590 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6591 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6592 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6593 .addReg(Src1.getReg());
6594 Src1.setReg(RegOp1);
6595 }
6596 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6597 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6598 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6599 .addReg(Src2.getReg());
6600 Src2.setReg(RegOp2);
6601 }
6602
6603 if (ST.isWave64()) {
6604 if (ST.hasScalarCompareEq64()) {
6605 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6606 .addReg(Src2.getReg())
6607 .addImm(0);
6608 } else {
6609 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6610 const TargetRegisterClass *SubRC =
6611 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6612 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6613 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6614 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6615 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6616 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6617
6618 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6619 .add(Src2Sub0)
6620 .add(Src2Sub1);
6621
6622 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6623 .addReg(Src2_32, RegState::Kill)
6624 .addImm(0);
6625 }
6626 } else {
6627 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6628 .addReg(Src2.getReg())
6629 .addImm(0);
6630 }
6631
6632 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6633 ? AMDGPU::S_ADDC_U32
6634 : AMDGPU::S_SUBB_U32;
6635
6636 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6637
6638 unsigned SelOpc =
6639 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6640
6641 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6642 .addImm(-1)
6643 .addImm(0);
6644
6645 MI.eraseFromParent();
6646 return BB;
6647 }
6648 case AMDGPU::SI_INIT_M0: {
6649 MachineOperand &M0Init = MI.getOperand(0);
6650 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6651 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6652 AMDGPU::M0)
6653 .add(M0Init);
6654 MI.eraseFromParent();
6655 return BB;
6656 }
6657 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6658 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6659 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6660 TII->get(AMDGPU::S_CMP_EQ_U32))
6661 .addImm(0)
6662 .addImm(0);
6663 return BB;
6664 }
6665 case AMDGPU::GET_GROUPSTATICSIZE: {
6666 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6667 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6668 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6669 .add(MI.getOperand(0))
6670 .addImm(MFI->getLDSSize());
6671 MI.eraseFromParent();
6672 return BB;
6673 }
6674 case AMDGPU::GET_SHADERCYCLESHILO: {
6675 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6676 // The algorithm is:
6677 //
6678 // hi1 = getreg(SHADER_CYCLES_HI)
6679 // lo1 = getreg(SHADER_CYCLES_LO)
6680 // hi2 = getreg(SHADER_CYCLES_HI)
6681 //
6682 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6683 // Otherwise there was overflow and the result is hi2:0. In both cases the
6684 // result should represent the actual time at some point during the sequence
6685 // of three getregs.
6686 using namespace AMDGPU::Hwreg;
6687 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6688 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6689 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6690 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6691 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6692 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6693 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6694 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6695 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6696 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6697 .addReg(RegHi1)
6698 .addReg(RegHi2);
6699 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6700 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6701 .addReg(RegLo1)
6702 .addImm(0);
6703 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6704 .add(MI.getOperand(0))
6705 .addReg(RegLo)
6706 .addImm(AMDGPU::sub0)
6707 .addReg(RegHi2)
6708 .addImm(AMDGPU::sub1);
6709 MI.eraseFromParent();
6710 return BB;
6711 }
6712 case AMDGPU::SI_INDIRECT_SRC_V1:
6713 case AMDGPU::SI_INDIRECT_SRC_V2:
6714 case AMDGPU::SI_INDIRECT_SRC_V3:
6715 case AMDGPU::SI_INDIRECT_SRC_V4:
6716 case AMDGPU::SI_INDIRECT_SRC_V5:
6717 case AMDGPU::SI_INDIRECT_SRC_V6:
6718 case AMDGPU::SI_INDIRECT_SRC_V7:
6719 case AMDGPU::SI_INDIRECT_SRC_V8:
6720 case AMDGPU::SI_INDIRECT_SRC_V9:
6721 case AMDGPU::SI_INDIRECT_SRC_V10:
6722 case AMDGPU::SI_INDIRECT_SRC_V11:
6723 case AMDGPU::SI_INDIRECT_SRC_V12:
6724 case AMDGPU::SI_INDIRECT_SRC_V16:
6725 case AMDGPU::SI_INDIRECT_SRC_V32:
6726 return emitIndirectSrc(MI, *BB, *getSubtarget());
6727 case AMDGPU::SI_INDIRECT_DST_V1:
6728 case AMDGPU::SI_INDIRECT_DST_V2:
6729 case AMDGPU::SI_INDIRECT_DST_V3:
6730 case AMDGPU::SI_INDIRECT_DST_V4:
6731 case AMDGPU::SI_INDIRECT_DST_V5:
6732 case AMDGPU::SI_INDIRECT_DST_V6:
6733 case AMDGPU::SI_INDIRECT_DST_V7:
6734 case AMDGPU::SI_INDIRECT_DST_V8:
6735 case AMDGPU::SI_INDIRECT_DST_V9:
6736 case AMDGPU::SI_INDIRECT_DST_V10:
6737 case AMDGPU::SI_INDIRECT_DST_V11:
6738 case AMDGPU::SI_INDIRECT_DST_V12:
6739 case AMDGPU::SI_INDIRECT_DST_V16:
6740 case AMDGPU::SI_INDIRECT_DST_V32:
6741 return emitIndirectDst(MI, *BB, *getSubtarget());
6742 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6743 case AMDGPU::SI_KILL_I1_PSEUDO:
6744 return splitKillBlock(MI, BB);
6745 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6746 Register Dst = MI.getOperand(0).getReg();
6747 const MachineOperand &Src0 = MI.getOperand(1);
6748 const MachineOperand &Src1 = MI.getOperand(2);
6749 Register SrcCond = MI.getOperand(3).getReg();
6750
6751 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6752 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6753 const auto *CondRC = TRI->getWaveMaskRegClass();
6754 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6755
6756 const TargetRegisterClass *Src0RC = Src0.isReg()
6757 ? MRI.getRegClass(Src0.getReg())
6758 : &AMDGPU::VReg_64RegClass;
6759 const TargetRegisterClass *Src1RC = Src1.isReg()
6760 ? MRI.getRegClass(Src1.getReg())
6761 : &AMDGPU::VReg_64RegClass;
6762
6763 const TargetRegisterClass *Src0SubRC =
6764 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6765 const TargetRegisterClass *Src1SubRC =
6766 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6767
6768 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6769 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6770 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6771 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6772
6773 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6774 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6775 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6776 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6777
6778 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6779 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6780 .addImm(0)
6781 .add(Src0Sub0)
6782 .addImm(0)
6783 .add(Src1Sub0)
6784 .addReg(SrcCondCopy);
6785 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6786 .addImm(0)
6787 .add(Src0Sub1)
6788 .addImm(0)
6789 .add(Src1Sub1)
6790 .addReg(SrcCondCopy);
6791
6792 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6793 .addReg(DstLo)
6794 .addImm(AMDGPU::sub0)
6795 .addReg(DstHi)
6796 .addImm(AMDGPU::sub1);
6797 MI.eraseFromParent();
6798 return BB;
6799 }
6800 case AMDGPU::SI_BR_UNDEF: {
6801 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6802 .add(MI.getOperand(0));
6803 Br->getOperand(1).setIsUndef(); // read undef SCC
6804 MI.eraseFromParent();
6805 return BB;
6806 }
6807 case AMDGPU::ADJCALLSTACKUP:
6808 case AMDGPU::ADJCALLSTACKDOWN: {
6810 MachineInstrBuilder MIB(*MF, &MI);
6811 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6812 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6813 return BB;
6814 }
6815 case AMDGPU::SI_CALL_ISEL: {
6816 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6817
6819 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6820
6821 for (const MachineOperand &MO : MI.operands())
6822 MIB.add(MO);
6823
6824 MIB.cloneMemRefs(MI);
6825 MI.eraseFromParent();
6826 return BB;
6827 }
6828 case AMDGPU::V_ADD_CO_U32_e32:
6829 case AMDGPU::V_SUB_CO_U32_e32:
6830 case AMDGPU::V_SUBREV_CO_U32_e32: {
6831 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6832 unsigned Opc = MI.getOpcode();
6833
6834 bool NeedClampOperand = false;
6835 if (TII->pseudoToMCOpcode(Opc) == -1) {
6837 NeedClampOperand = true;
6838 }
6839
6840 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6841 if (TII->isVOP3(*I)) {
6842 I.addReg(TRI->getVCC(), RegState::Define);
6843 }
6844 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6845 if (NeedClampOperand)
6846 I.addImm(0); // clamp bit for e64 encoding
6847
6848 TII->legalizeOperands(*I);
6849
6850 MI.eraseFromParent();
6851 return BB;
6852 }
6853 case AMDGPU::V_ADDC_U32_e32:
6854 case AMDGPU::V_SUBB_U32_e32:
6855 case AMDGPU::V_SUBBREV_U32_e32:
6856 // These instructions have an implicit use of vcc which counts towards the
6857 // constant bus limit.
6858 TII->legalizeOperands(MI);
6859 return BB;
6860 case AMDGPU::DS_GWS_INIT:
6861 case AMDGPU::DS_GWS_SEMA_BR:
6862 case AMDGPU::DS_GWS_BARRIER:
6863 case AMDGPU::DS_GWS_SEMA_V:
6864 case AMDGPU::DS_GWS_SEMA_P:
6865 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6866 // A s_waitcnt 0 is required to be the instruction immediately following.
6867 if (getSubtarget()->hasGWSAutoReplay()) {
6869 return BB;
6870 }
6871
6872 return emitGWSMemViolTestLoop(MI, BB);
6873 case AMDGPU::S_SETREG_B32: {
6874 // Try to optimize cases that only set the denormal mode or rounding mode.
6875 //
6876 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6877 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6878 // instead.
6879 //
6880 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6881 // allow you to have a no side effect instruction in the output of a
6882 // sideeffecting pattern.
6883 auto [ID, Offset, Width] =
6884 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6886 return BB;
6887
6888 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6889 const unsigned SetMask = WidthMask << Offset;
6890
6891 if (getSubtarget()->hasDenormModeInst()) {
6892 unsigned SetDenormOp = 0;
6893 unsigned SetRoundOp = 0;
6894
6895 // The dedicated instructions can only set the whole denorm or round mode
6896 // at once, not a subset of bits in either.
6897 if (SetMask ==
6899 // If this fully sets both the round and denorm mode, emit the two
6900 // dedicated instructions for these.
6901 SetRoundOp = AMDGPU::S_ROUND_MODE;
6902 SetDenormOp = AMDGPU::S_DENORM_MODE;
6903 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6904 SetRoundOp = AMDGPU::S_ROUND_MODE;
6905 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6906 SetDenormOp = AMDGPU::S_DENORM_MODE;
6907 }
6908
6909 if (SetRoundOp || SetDenormOp) {
6910 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6911 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6912 unsigned ImmVal = Def->getOperand(1).getImm();
6913 if (SetRoundOp) {
6914 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6915 .addImm(ImmVal & 0xf);
6916
6917 // If we also have the denorm mode, get just the denorm mode bits.
6918 ImmVal >>= 4;
6919 }
6920
6921 if (SetDenormOp) {
6922 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6923 .addImm(ImmVal & 0xf);
6924 }
6925
6926 MI.eraseFromParent();
6927 return BB;
6928 }
6929 }
6930 }
6931
6932 // If only FP bits are touched, used the no side effects pseudo.
6933 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6934 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6935 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6936
6937 return BB;
6938 }
6939 case AMDGPU::S_INVERSE_BALLOT_U32:
6940 case AMDGPU::S_INVERSE_BALLOT_U64:
6941 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6942 // necessary. After that they are equivalent to a COPY.
6943 MI.setDesc(TII->get(AMDGPU::COPY));
6944 return BB;
6945 case AMDGPU::ENDPGM_TRAP: {
6946 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6947 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6948 MI.addOperand(MachineOperand::CreateImm(0));
6949 return BB;
6950 }
6951
6952 // We need a block split to make the real endpgm a terminator. We also don't
6953 // want to break phis in successor blocks, so we can't just delete to the
6954 // end of the block.
6955
6956 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6958 MF->push_back(TrapBB);
6959 // clang-format off
6960 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6961 .addImm(0);
6962 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6963 .addMBB(TrapBB);
6964 // clang-format on
6965
6966 BB->addSuccessor(TrapBB);
6967 MI.eraseFromParent();
6968 return SplitBB;
6969 }
6970 case AMDGPU::SIMULATED_TRAP: {
6971 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6972 MachineBasicBlock *SplitBB =
6973 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6974 MI.eraseFromParent();
6975 return SplitBB;
6976 }
6977 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6978 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6980
6981 // During ISel, it's difficult to propagate the original EXEC mask to use as
6982 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6983 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6984 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6985 Register OriginalExec = Setup->getOperand(0).getReg();
6986 MF->getRegInfo().clearKillFlags(OriginalExec);
6987 MI.getOperand(0).setReg(OriginalExec);
6988 return BB;
6989 }
6990 default:
6991 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6992 if (!MI.mayStore())
6994 return BB;
6995 }
6997 }
6998}
6999
7001 // This currently forces unfolding various combinations of fsub into fma with
7002 // free fneg'd operands. As long as we have fast FMA (controlled by
7003 // isFMAFasterThanFMulAndFAdd), we should perform these.
7004
7005 // When fma is quarter rate, for f64 where add / sub are at best half rate,
7006 // most of these combines appear to be cycle neutral but save on instruction
7007 // count / code size.
// Unconditionally enabled: per the comments above, the fused forms are at
// worst cycle neutral and reduce instruction count / code size.
7008 return true;
7009 }
7010
7012
// Result type of a SETCC: a single i1 for scalar compares, or a vector of i1
// with one element per lane of the compared vector type.
7014 EVT VT) const {
7015 if (!VT.isVector()) {
7016 return MVT::i1;
7017 }
7018 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements())
7019 }
7020
7022 // TODO: Should i16 be used always if legal? For now it would force VALU
7023 // shifts.
// Shift amounts are i16 only when the shifted type itself is i16; every other
// type uses an i32 shift amount.
7024 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7025 }
7026
// Prefer a 16-bit shift-amount element size for scalars of <= 16 bits when the
// subtarget has 16-bit instructions; otherwise widen the element size to 32.
7028 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7029 ? Ty.changeElementSize(16)
7030 : Ty.changeElementSize(32);
7031 }
7032
7033 // Answering this is somewhat tricky and depends on the specific device which
7034 // have different rates for fma or all f64 operations.
7035 //
7036 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
7037 // regardless of which device (although the number of cycles differs between
7038 // devices), so it is always profitable for f64.
7039 //
7040 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
7041 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
7042 // which we can always do even without fused FP ops since it returns the same
7043 // result as the separate operations and since it is always full
7044 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
7045 // however does not support denormals, so we do report fma as faster if we have
7046 // a fast fma device and require denormals.
7047 //
7049 EVT VT) const {
// Only the scalar element type matters for the rate comparison.
7050 VT = VT.getScalarType();
7051
7052 switch (VT.getSimpleVT().SimpleTy) {
7053 case MVT::f32: {
7054 // If mad is not available this depends only on if f32 fma is full rate.
7055 if (!Subtarget->hasMadMacF32Insts())
7056 return Subtarget->hasFastFMAF32();
7057
7058 // Otherwise f32 mad is always full rate and returns the same result as
7059 // the separate operations so should be preferred over fma.
7060 // However does not support denormals.
// NOTE(review): the if-condition (a denormal-mode check) selecting between the
// next two returns appears to be truncated in this listing — verify against
// the upstream source before relying on this control flow.
7062 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7063
7064 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
7065 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7066 }
7067 case MVT::f64:
// Always profitable for f64 (see the comment block above).
7068 return true;
7069 case MVT::f16:
7070 case MVT::bf16:
// 16-bit FMA is only usable with 16-bit instructions and when the f64/f16
// denormal mode is not flush-to-zero.
7071 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
7072 default:
7073 break;
7074 }
7075
7076 return false;
7077 }
7078
7080 LLT Ty) const {
// LLT overload: dispatch on the scalar bit width to the EVT-based query.
7081 switch (Ty.getScalarSizeInBits()) {
7082 case 16:
7083 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
7084 case 32:
7085 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
7086 case 64:
7087 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
7088 default:
7089 break;
7090 }
7091
// Unhandled widths are never reported as FMA-profitable.
7092 return false;
7093 }
7094
// MAD (v_mad_f16 / v_mad_macf32) does not support denormals, so forming it is
// only legal when the matching denormal mode flushes. Vectors are rejected.
7096 if (!Ty.isScalar())
7097 return false;
7098
7099 if (Ty.getScalarSizeInBits() == 16)
7100 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
7101 if (Ty.getScalarSizeInBits() == 32)
7102 return Subtarget->hasMadMacF32Insts() &&
7103 denormalModeIsFlushAllF32(*MI.getMF());
7104
7105 return false;
7106 }
7107
7109 const SDNode *N) const {
7110 // TODO: Check future ftz flag
7111 // v_mad_f32/v_mac_f32 do not support denormals.
7112 EVT VT = N->getValueType(0);
// NOTE(review): the continuation lines of the two return expressions (the
// denormal-mode-check operands) appear truncated in this listing — verify
// against the upstream source.
7113 if (VT == MVT::f32)
7114 return Subtarget->hasMadMacF32Insts() &&
7116 if (VT == MVT::f16) {
7117 return Subtarget->hasMadF16() &&
7119 }
7120
7121 return false;
7122 }
7123
7124//===----------------------------------------------------------------------===//
7125// Custom DAG Lowering Operations
7126//===----------------------------------------------------------------------===//
7127
7128 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7129 // wider vector type is legal.
// Splits the single vector operand in half, applies the same opcode to each
// half (preserving node flags), and reassembles the result with
// CONCAT_VECTORS.
7131 SelectionDAG &DAG) const {
7132 unsigned Opc = Op.getOpcode();
7133 EVT VT = Op.getValueType();
7134 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7135 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7136 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7137 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7138 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7139 VT == MVT::v32bf16);
7140
7141 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7142
7143 SDLoc SL(Op);
7144 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
7145 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
7146
7147 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7148 }
7149
7150 // Enable lowering of ROTR for vxi32 types. This is a workaround for a
7151 // regression whereby extra unnecessary instructions were added to codegen
7152 // for rotr operations, caused by legalising v2i32 or. This resulted in extra
7153 // instructions to extract the result from the vector.
// Fully scalarizes the vector ROTR via UnrollVectorOp; VT is only read for
// the assertion below, hence [[maybe_unused]].
7155 [[maybe_unused]] EVT VT = Op.getValueType();
7156
7157 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7158 VT == MVT::v16i32) &&
7159 "Unexpected ValueType.");
7160
7161 return DAG.UnrollVectorOp(Op.getNode());
7162 }
7163
7164 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7165 // wider vector type is legal.
// Splits both vector operands in half, applies the opcode to each half pair
// (preserving node flags), and reassembles with CONCAT_VECTORS.
7167 SelectionDAG &DAG) const {
7168 unsigned Opc = Op.getOpcode();
7169 EVT VT = Op.getValueType();
7170 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7171 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7172 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7173 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7174 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7175 VT == MVT::v32bf16);
7176
7177 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
7178 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7179
7180 SDLoc SL(Op);
7181
7182 SDValue OpLo =
7183 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
7184 SDValue OpHi =
7185 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
7186
7187 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7188 }
7189
// Ternary analogue of splitBinaryVectorOp: splits all three operands, applies
// the opcode per half, and concatenates. Operand 0 may be a scalar (checked
// below); in that case the same scalar is reused unsplit for both halves.
7191 SelectionDAG &DAG) const {
7192 unsigned Opc = Op.getOpcode();
7193 EVT VT = Op.getValueType();
7194 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7195 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7196 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7197 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7198 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7199 VT == MVT::v32bf16);
7200
7201 SDValue Op0 = Op.getOperand(0);
7202 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7203 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7204 : std::pair(Op0, Op0);
7205
7206 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7207 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7208
7209 SDLoc SL(Op);
7210 auto ResVT = DAG.GetSplitDestVTs(VT);
7211
7212 SDValue OpLo =
7213 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7214 SDValue OpHi =
7215 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7216
7217 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7218 }
7219
// Top-level dispatch for custom DAG lowering: routes each opcode to its
// dedicated Lower*/lower* helper or to the generic split helpers.
// NOTE(review): several `case` labels (e.g. for the bare returns near
// LowerATOMIC_CMP_SWAP, LowerExternalSymbol, the INTRINSIC_* handlers,
// INSERT/EXTRACT_VECTOR_ELT, DYNAMIC_STACKALLOC) appear to be missing from
// this listing — verify against the upstream source.
7221 switch (Op.getOpcode()) {
7222 default:
7224 case ISD::BRCOND:
7225 return LowerBRCOND(Op, DAG);
7226 case ISD::RETURNADDR:
7227 return LowerRETURNADDR(Op, DAG);
7228 case ISD::SPONENTRY:
7229 return LowerSPONENTRY(Op, DAG);
7230 case ISD::LOAD: {
7231 SDValue Result = LowerLOAD(Op, DAG);
7232 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7233 "Load should return a value and a chain");
7234 return Result;
7235 }
7236 case ISD::FSQRT: {
7237 EVT VT = Op.getValueType();
7238 if (VT == MVT::f32)
7239 return lowerFSQRTF32(Op, DAG);
7240 if (VT == MVT::f64)
7241 return lowerFSQRTF64(Op, DAG);
7242 return SDValue();
7243 }
7244 case ISD::FSIN:
7245 case ISD::FCOS:
7246 return LowerTrig(Op, DAG);
7247 case ISD::SELECT:
7248 return LowerSELECT(Op, DAG);
7249 case ISD::FDIV:
7250 return LowerFDIV(Op, DAG);
7251 case ISD::FFREXP:
7252 return LowerFFREXP(Op, DAG);
7254 return LowerATOMIC_CMP_SWAP(Op, DAG);
7255 case ISD::STORE:
7256 return LowerSTORE(Op, DAG);
7257 case ISD::GlobalAddress: {
7260 return LowerGlobalAddress(MFI, Op, DAG);
7261 }
7263 return LowerExternalSymbol(Op, DAG);
7265 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7267 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7269 return LowerINTRINSIC_VOID(Op, DAG);
7270 case ISD::ADDRSPACECAST:
7271 return lowerADDRSPACECAST(Op, DAG);
7273 return lowerINSERT_SUBVECTOR(Op, DAG);
7275 return lowerINSERT_VECTOR_ELT(Op, DAG);
7277 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7279 return lowerVECTOR_SHUFFLE(Op, DAG);
7281 return lowerSCALAR_TO_VECTOR(Op, DAG);
7282 case ISD::BUILD_VECTOR:
7283 return lowerBUILD_VECTOR(Op, DAG);
7284 case ISD::FP_ROUND:
7286 return lowerFP_ROUND(Op, DAG);
7287 case ISD::TRAP:
7288 return lowerTRAP(Op, DAG);
7289 case ISD::DEBUGTRAP:
7290 return lowerDEBUGTRAP(Op, DAG);
7291 case ISD::ABS:
7292 case ISD::FABS:
7293 case ISD::FNEG:
7294 case ISD::FCANONICALIZE:
7295 case ISD::BSWAP:
7296 return splitUnaryVectorOp(Op, DAG);
7297 case ISD::FMINNUM:
7298 case ISD::FMAXNUM:
7299 return lowerFMINNUM_FMAXNUM(Op, DAG);
7300 case ISD::FMINIMUMNUM:
7301 case ISD::FMAXIMUMNUM:
7302 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7303 case ISD::FMINIMUM:
7304 case ISD::FMAXIMUM:
7305 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7306 case ISD::FLDEXP:
7307 case ISD::STRICT_FLDEXP:
7308 return lowerFLDEXP(Op, DAG);
7309 case ISD::FMA:
7310 return splitTernaryVectorOp(Op, DAG);
7311 case ISD::FP_TO_SINT:
7312 case ISD::FP_TO_UINT:
7313 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7314 Op.getValueType() == MVT::i16 &&
7315 Op.getOperand(0).getValueType() == MVT::f32) {
7316 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7317 return Op;
7318 }
7319 return LowerFP_TO_INT(Op, DAG);
7320 case ISD::SHL:
7321 case ISD::SRA:
7322 case ISD::SRL:
7323 case ISD::ADD:
7324 case ISD::SUB:
7325 case ISD::SMIN:
7326 case ISD::SMAX:
7327 case ISD::UMIN:
7328 case ISD::UMAX:
7329 case ISD::FADD:
7330 case ISD::FMUL:
7331 case ISD::FMINNUM_IEEE:
7332 case ISD::FMAXNUM_IEEE:
7333 case ISD::UADDSAT:
7334 case ISD::USUBSAT:
7335 case ISD::SADDSAT:
7336 case ISD::SSUBSAT:
7337 return splitBinaryVectorOp(Op, DAG);
7338 case ISD::FCOPYSIGN:
7339 return lowerFCOPYSIGN(Op, DAG);
7340 case ISD::MUL:
7341 return lowerMUL(Op, DAG);
7342 case ISD::SMULO:
7343 case ISD::UMULO:
7344 return lowerXMULO(Op, DAG);
7345 case ISD::SMUL_LOHI:
7346 case ISD::UMUL_LOHI:
7347 return lowerXMUL_LOHI(Op, DAG);
7349 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7350 case ISD::STACKSAVE:
7351 return LowerSTACKSAVE(Op, DAG);
7352 case ISD::GET_ROUNDING:
7353 return lowerGET_ROUNDING(Op, DAG);
7354 case ISD::SET_ROUNDING:
7355 return lowerSET_ROUNDING(Op, DAG);
7356 case ISD::PREFETCH:
7357 return lowerPREFETCH(Op, DAG);
7358 case ISD::FP_EXTEND:
7360 return lowerFP_EXTEND(Op, DAG);
7361 case ISD::GET_FPENV:
7362 return lowerGET_FPENV(Op, DAG);
7363 case ISD::SET_FPENV:
7364 return lowerSET_FPENV(Op, DAG);
7365 case ISD::ROTR:
7366 return lowerROTR(Op, DAG);
7367 }
7368 return SDValue();
7369 }
7370
7371 // Used for D16: Casts the result of an instruction into the right vector,
7372 // packs values if loads return unpacked values.
// NOTE(review): two continuation lines (the EVT::getVectorVT call operands and
// a SmallVector declaration) appear truncated in this listing — verify
// against the upstream source.
7374 const SDLoc &DL, SelectionDAG &DAG,
7375 bool Unpacked) {
// Scalars need no repacking.
7376 if (!LoadVT.isVector())
7377 return Result;
7378
7379 // Cast back to the original packed type or to a larger type that is a
7380 // multiple of 32 bit for D16. Widening the return type is required for
7381 // legalization.
7382 EVT FittingLoadVT = LoadVT;
7383 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7384 FittingLoadVT =
7386 LoadVT.getVectorNumElements() + 1);
7387 }
7388
7389 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7390 // Truncate to v2i16/v4i16.
7391 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7392
7393 // Workaround legalizer not scalarizing truncate after vector op
7394 // legalization but not creating intermediate vector trunc.
7396 DAG.ExtractVectorElements(Result, Elts);
7397 for (SDValue &Elt : Elts)
7398 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7399
7400 // Pad illegal v1i16/v3f16 to v4i16
7401 if ((LoadVT.getVectorNumElements() % 2) == 1)
7402 Elts.push_back(DAG.getPOISON(MVT::i16));
7403
7404 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7405
7406 // Bitcast to original type (v2f16/v4f16).
7407 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7408 }
7409
7410 // Cast back to the original packed type.
7411 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7412 }
7413
// Builds a D16 memory-intrinsic load with a legal equivalent result type
// (i32-per-element when the subtarget uses unpacked D16, or a widened even
// element count for odd vectors), then repacks the result back to the
// requested type via adjustLoadValueTypeImpl. Returns {value, chain}.
7414 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7415 SelectionDAG &DAG,
7417 bool IsIntrinsic) const {
7418 SDLoc DL(M);
7419
7420 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7421 EVT LoadVT = M->getValueType(0);
7422
7423 EVT EquivLoadVT = LoadVT;
7424 if (LoadVT.isVector()) {
7425 if (Unpacked) {
7426 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7427 LoadVT.getVectorNumElements());
7428 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7429 // Widen v3f16 to legal type
7430 EquivLoadVT =
7432 LoadVT.getVectorNumElements() + 1);
7433 }
7434 }
7435
7436 // Change from v4f16/v2f16 to EquivLoadVT.
7437 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7438
7440 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7441 M->getMemoryVT(), M->getMemOperand());
7442
7443 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7444
7445 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7446 }
7447
// Lowers buffer-load intrinsics to the matching AMDGPUISD BUFFER_LOAD* node,
// choosing the FORMAT and/or TFE variants from IsFormat and the node's value
// count, and special-casing D16 (16-bit format elements), sub-dword scalar
// loads, and illegal result types (which are loaded as an equivalent legal
// type and bitcast back).
7448 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7449 SelectionDAG &DAG,
7450 ArrayRef<SDValue> Ops) const {
7451 SDLoc DL(M);
7452 EVT LoadVT = M->getValueType(0);
7453 EVT EltType = LoadVT.getScalarType();
7454 EVT IntVT = LoadVT.changeTypeToInteger();
7455
7456 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7457
// A third result value indicates a TFE (texture-fail-enable) status return.
7458 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7459 bool IsTFE = M->getNumValues() == 3;
7460
7461 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7462 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7463 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7464 : AMDGPUISD::BUFFER_LOAD;
7465
7466 if (IsD16) {
7467 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7468 }
7469
7470 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7471 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7472 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7473 IsTFE);
7474
7475 if (isTypeLegal(LoadVT)) {
7476 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7477 M->getMemOperand(), DAG);
7478 }
7479
// Illegal result type: load as the equivalent legal memory type and bitcast.
7480 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7481 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7482 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7483 M->getMemOperand(), DAG);
7484 return DAG.getMergeValues(
7485 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7486 DL);
7487 }
7488
// Lowers the amdgcn icmp intrinsic: validates the integer predicate operand,
// promotes illegal i16 operands to i32, and emits an AMDGPUISD::SETCC whose
// result is a wavefront-sized lane mask, zero-extended/truncated to the
// requested result type if the widths differ.
7490 SelectionDAG &DAG) {
7491 EVT VT = N->getValueType(0);
7492 unsigned CondCode = N->getConstantOperandVal(3);
// Non-integer predicates make the intrinsic call undefined; fold to poison.
7493 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7494 return DAG.getPOISON(VT);
7495
7496 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7497
7498 SDValue LHS = N->getOperand(1);
7499 SDValue RHS = N->getOperand(2);
7500
7501 SDLoc DL(N);
7502
7503 EVT CmpVT = LHS.getValueType();
7504 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
// NOTE(review): the initializer of PromoteOp (choice of sign- vs
// zero-extension based on the predicate) appears truncated in this listing —
// verify against the upstream source.
7505 unsigned PromoteOp =
7507 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7508 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7509 }
7510
7511 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7512
// The comparison result is one bit per lane: an integer as wide as the wave.
7513 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7514 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7515
7516 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7517 DAG.getCondCode(CCOpcode));
7518 if (VT.bitsEq(CCVT))
7519 return SetCC;
7520 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7521 }
7522
// Lowers the amdgcn fcmp intrinsic: validates the FP predicate operand,
// extends illegal f16 operands to f32, and emits an AMDGPUISD::SETCC producing
// a wavefront-sized lane mask, adjusted to the requested result width.
7524 SelectionDAG &DAG) {
7525 EVT VT = N->getValueType(0);
7526
7527 unsigned CondCode = N->getConstantOperandVal(3);
// Non-FP predicates make the intrinsic call undefined; fold to poison.
7528 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7529 return DAG.getPOISON(VT);
7530
7531 SDValue Src0 = N->getOperand(1);
7532 SDValue Src1 = N->getOperand(2);
7533 EVT CmpVT = Src0.getValueType();
7534 SDLoc SL(N);
7535
7536 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7537 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7538 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7539 }
7540
7541 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7542 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7543 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7544 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7545 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7546 DAG.getCondCode(CCOpcode));
7547 if (VT.bitsEq(CCVT))
7548 return SetCC;
7549 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7550 }
7551
// Lowers the amdgcn ballot intrinsic. Folds the common shapes:
//   ballot(setcc)  -> AMDGPUISD::SETCC on the compare operands
//   ballot(0)      -> constant 0
//   ballot(1)      -> a copy of EXEC / EXEC_LO (by result width)
// and otherwise compares the zero-extended i1 source against 0.
7553 SelectionDAG &DAG) {
7554 EVT VT = N->getValueType(0);
7555 SDValue Src = N->getOperand(1);
7556 SDLoc SL(N);
7557
7558 if (Src.getOpcode() == ISD::SETCC) {
7559 SDValue Op0 = Src.getOperand(0);
7560 SDValue Op1 = Src.getOperand(1);
7561 // Need to expand bfloat to float for comparison (setcc).
7562 if (Op0.getValueType() == MVT::bf16) {
7563 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7564 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7565 }
7566 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7567 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7568 }
7569 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7570 // (ballot 0) -> 0
7571 if (Arg->isZero())
7572 return DAG.getConstant(0, SL, VT);
7573
7574 // (ballot 1) -> EXEC/EXEC_LO
7575 if (Arg->isOne()) {
7576 Register Exec;
7577 if (VT.getScalarSizeInBits() == 32)
7578 Exec = AMDGPU::EXEC_LO;
7579 else if (VT.getScalarSizeInBits() == 64)
7580 Exec = AMDGPU::EXEC;
7581 else
7582 return SDValue();
7583
7584 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7585 }
7586 }
7587
7588 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7589 // ISD::SETNE)
7590 return DAG.getNode(
7591 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7592 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7593 }
7594
                              SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  unsigned ValSize = VT.getSizeInBits();
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  SDLoc SL(N);
  MVT IntVT = MVT::getIntegerVT(ValSize);
  const GCNSubtarget *ST = TLI.getSubtarget();
  // Lane ops are normally legalized in 32-bit pieces; update.dpp can use
  // 64-bit pieces when the subtarget has a 64-bit DPP ALU and the DPP
  // control immediate (operand 3) is legal for it.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
    SplitSize = 64;

  // Rebuild the intrinsic with the given sources at type ValT, carrying over
  // trailing immediate operands and any convergence-control glue input.
  auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
                                          SDValue Src2, MVT ValT) -> SDValue {
    SmallVector<SDValue, 8> Operands;
    // Operands are pushed back-to-front and reversed below; cases fall
    // through from the intrinsics with the most operands to the fewest.
    switch (IID) {
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
      Operands.push_back(N->getOperand(6));
      Operands.push_back(N->getOperand(5));
      Operands.push_back(N->getOperand(4));
      [[fallthrough]];
    case Intrinsic::amdgcn_writelane:
      Operands.push_back(Src2);
      [[fallthrough]];
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
      Operands.push_back(Src1);
      [[fallthrough]];
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      Operands.push_back(Src0);
      break;
    default:
      llvm_unreachable("unhandled lane op");
    }

    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
    std::reverse(Operands.begin(), Operands.end());

    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(0).getNode();
      Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
                                     SDValue(GL, 0)));
    }

    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
  };

  // Gather the source operands each intrinsic actually has.
  SDValue Src0 = N->getOperand(1);
  SDValue Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  }

  if (ValSize == SplitSize) {
    // Already legal
    return SDValue();
  }

  if (ValSize < 32) {
    // Sub-32-bit values: bitcast FP types to int, widen to i32, perform the
    // lane op at i32, then narrow (and bitcast back) to the original type.
    bool IsFloat = VT.isFloatingPoint();
    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
                                SL, MVT::i32);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
      Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
                                  SL, MVT::i32);
    }

    if (IID == Intrinsic::amdgcn_writelane) {
      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
                                  SL, MVT::i32);
    }

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  }

  if (ValSize % SplitSize != 0)
    return SDValue();

  // Scalarize a vector-typed lane op: apply the op per element, extracting
  // elements from vector operands and passing scalar operands through.
  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
    EVT VT = N->getValueType(0);
    unsigned NE = VT.getVectorNumElements();
    EVT EltVT = VT.getVectorElementType();
    unsigned NumOperands = N->getNumOperands();
    SmallVector<SDValue, 4> Operands(NumOperands);
    SDNode *GL = N->getGluedNode();

    // only handle convergencectrl_glue

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
           ++j) {
        SDValue Operand = N->getOperand(j);
        EVT OperandVT = Operand.getValueType();
        if (OperandVT.isVector()) {
          // A vector operand; extract a single element.
          EVT OperandEltVT = OperandVT.getVectorElementType();
          Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
                                    Operand, DAG.getVectorIdxConstant(i, SL));
        } else {
          // A scalar operand; just use it as is.
          Operands[j] = Operand;
        }
      }

      // Re-attach the convergence-control glue as the last operand.
      if (GL)
        Operands[NumOperands - 1] =
            DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
                        SDValue(GL->getOperand(0).getNode(), 0));

      Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
    }

    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
    return DAG.getBuildVector(VecVT, SL, Scalars);
  };

  if (VT.isVector()) {
    switch (MVT::SimpleValueType EltTy =
    case MVT::i32:
    case MVT::f32:
      if (SplitSize == 32) {
        // 32-bit elements at 32-bit split size: unroll elementwise directly.
        SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
        return unrollLaneOp(LaneOp.getNode());
      }
      [[fallthrough]];
    case MVT::i16:
    case MVT::f16:
    case MVT::bf16: {
      // 16-bit elements: process SplitSize-wide subvectors one at a time.
      unsigned SubVecNumElt =
          SplitSize / VT.getVectorElementType().getSizeInBits();
      MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
      for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
        Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
                                 DAG.getConstant(EltIdx, SL, MVT::i32));

        if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
            IsPermLane16)
          Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));

        if (IID == Intrinsic::amdgcn_writelane)
          Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));

        Pieces.push_back(
            IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
                ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
        EltIdx += SubVecNumElt;
      }
      return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
    }
    default:
      // Handle all other cases by bitcasting to i32 vectors
      break;
    }
  }

  // Generic path: view the value as a vector of SplitSize-wide integers,
  // perform the lane op, unroll it, and bitcast back.
  MVT VecVT =
      MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
  Src0 = DAG.getBitcast(VecVT, Src0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1 = DAG.getBitcast(VecVT, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2 = DAG.getBitcast(VecVT, Src2);

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
  return DAG.getBitcast(VT, UnrolledLaneOp);
}
7792
                              SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  // Only 32-bit payloads are handled here.
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  SDValue Value = N->getOperand(1);
  SDValue Index = N->getOperand(2);

  // ds_bpermute requires index to be multiplied by 4
  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
  SDValue ShiftedIndex =
      DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);

  // Intrinsics will require i32 to operate on
  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);

  // Helper: build an INTRINSIC_WO_CHAIN node with the intrinsic id prepended
  // as the first operand.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue> Operands(1);
    Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
    Operands.append(IntrinArgs);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
  };

  // If we can bpermute across the whole wave, then just do that
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, BPermute);
  }

  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode
  SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));

  // Set inactive lanes to poison
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  // permlane64 exchanges values between the two 32-lane halves of the wave.
  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from
  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
                    {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});

  // Bit 5 of (tid ^ index) is set iff the source lane is in the other half.
  SDValue SameOrOtherHalf =
      DAG.getNode(ISD::AND, SL, MVT::i32,
                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
                  DAG.getTargetConstant(32, SL, MVT::i32));
  SDValue UseSameHalf =
      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
                   DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
                                 BPermOtherHalfWWM);
  return DAG.getBitcast(VT, Result);
}
7870
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
    unsigned IID = N->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // Pack two f32 into v2f16 via the i32-typed CVT node, then bitcast.
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      SDValue Cvt =
          DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      unsigned Opcode;

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      // If v2i16 is illegal, produce the result at i32 and bitcast.
      EVT VT = N->getValueType(0);
      if (isTypeLegal(VT))
        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      }
      return;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
      // s_buffer_load_i8.
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(1);
      SDValue Offset = Op.getOperand(2);
      SDValue CachePolicy = Op.getOperand(3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      SDLoc DL(Op);
      const DataLayout &DataLayout = DAG.getDataLayout();
      Align Alignment =
                             VT.getStoreSize(), Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        // Uniform offset: emit the scalar buffer byte load directly.
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
                                    DAG.getVTList(MVT::i32), Ops, VT, MMO);
        LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
      } else {
        // Divergent offset: fall back to a vector buffer byte load.
        SDValue Ops[] = {
            DAG.getEntryNode(),                    // Chain
            Rsrc,                                  // rsrc
            DAG.getConstant(0, DL, MVT::i32),      // vindex
            {},                                    // voffset
            {},                                    // soffset
            {},                                    // offset
            CachePolicy,                           // cachepolicy
            DAG.getTargetConstant(0, DL, MVT::i1), // idxen
        };
        setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      }
      Results.push_back(LoadVal);
      return;
    }
    case Intrinsic::amdgcn_dead: {
      // Every result of the intrinsic is dead; replace each with poison.
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
        Results.push_back(DAG.getPOISON(N->getValueType(I)));
      return;
    }
    }
    break;
  }
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // FIXME: Hacky
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Res.getOperand(I));
        }
      } else {
        Results.push_back(Res);
        Results.push_back(Res.getValue(1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    // Select on illegal types: bitcast operands to an equivalent integer
    // type, select there (widening to i32 if needed), and bitcast back.
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect =
        DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    // Flip the sign bit of both f16 halves in one 32-bit xor.
    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    // Clear the sign bit of both f16 halves in one 32-bit and.
    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(0) != MVT::f16)
      break;
    Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
    break;
  }
  default:
    break;
  }
}
8057
8058/// Helper function for LowerBRCOND
8059static SDNode *findUser(SDValue Value, unsigned Opcode) {
8060
8061 for (SDUse &U : Value->uses()) {
8062 if (U.get() != Value)
8063 continue;
8064
8065 if (U.getUser()->getOpcode() == Opcode)
8066 return U.getUser();
8067 }
8068 return nullptr;
8069}
8070
8071unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
8072 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
8073 switch (Intr->getConstantOperandVal(1)) {
8074 case Intrinsic::amdgcn_if:
8075 return AMDGPUISD::IF;
8076 case Intrinsic::amdgcn_else:
8077 return AMDGPUISD::ELSE;
8078 case Intrinsic::amdgcn_loop:
8079 return AMDGPUISD::LOOP;
8080 case Intrinsic::amdgcn_end_cf:
8081 llvm_unreachable("should not occur");
8082 default:
8083 return 0;
8084 }
8085 }
8086
8087 // break, if_break, else_break are all only used as inputs to loop, not
8088 // directly as branch conditions.
8089 return 0;
8090}
8091
8098
  // Presumably no GOT-style relocation is used on PAL/Mesa3D — early out.
  // NOTE(review): the enclosing function's signature is not visible in this
  // chunk; confirm against the full file.
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
    return false;

  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
  return (GV->getValueType()->isFunctionTy() ||
}
8109
  // PC-relative addressing applies exactly when neither a fixup nor a GOT
  // relocation is emitted for this global.
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
8113
  // Globals without external linkage always qualify.
  if (!GV->hasExternalLinkage())
    return true;

  // Externally linked globals only qualify on the HSA and PAL OSes.
  const auto OS = getTargetMachine().getTargetTriple().getOS();
  return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
}
8121
8122/// This transforms the control flow intrinsics to get the branch destination as
8123/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  // Look through a setcc / xor-with-nonzero wrapper to reach the underlying
  // intrinsic; otherwise the real target is taken from the BR user below.
  switch (Intr->getOpcode()) {
  case ISD::SETCC: {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();
    break;
  }
  case ISD::XOR: {
    // Similar to SETCC, if we have (xor c, -1), we will be fine.
    SDValue LHS = Intr->getOperand(0);
    SDValue RHS = Intr->getOperand(1);
    if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
      Intr = LHS.getNode();
      break;
    }
    [[fallthrough]];
  }
  default: {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||

  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};

  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
  }

  // The chain result is always the last value produced by the new node.
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
                             SDValue(Result, i - 1), SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
                                Intr->getOperand(0));

  return Chain;
}
8217
8218SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8219 MVT VT = Op.getSimpleValueType();
8220 SDLoc DL(Op);
8221 // Checking the depth
8222 if (Op.getConstantOperandVal(0) != 0)
8223 return DAG.getConstant(0, DL, VT);
8224
8225 MachineFunction &MF = DAG.getMachineFunction();
8226 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8227 // Check for kernel and shader functions
8228 if (Info->isEntryFunction())
8229 return DAG.getConstant(0, DL, VT);
8230
8231 MachineFrameInfo &MFI = MF.getFrameInfo();
8232 // There is a call to @llvm.returnaddress in this function
8233 MFI.setReturnAddressIsTaken(true);
8234
8235 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8236 // Get the return address reg and mark it as an implicit live-in
8237 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8238 getRegClassFor(VT, Op.getNode()->isDivergent()));
8239
8240 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8241}
8242
8243SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8244 MachineFunction &MF = DAG.getMachineFunction();
8245 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8246
8247 // For functions that set up their own stack, select the GET_STACK_BASE
8248 // pseudo.
8249 if (MFI->isBottomOfStack())
8250 return Op;
8251
8252 // For everything else, create a dummy stack object.
8253 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8254 return DAG.getFrameIndex(FI, Op.getValueType());
8255}
8256
8257SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8258 const SDLoc &DL, EVT VT) const {
8259 return Op.getValueType().bitsLE(VT)
8260 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8261 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8262 DAG.getTargetConstant(0, DL, MVT::i32));
8263}
8264
8265SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8266 SelectionDAG &DAG) const {
8267 EVT DstVT = Op.getValueType();
8268 unsigned NumElts = DstVT.getVectorNumElements();
8269 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8270
8271 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8272
8273 SDLoc DL(Op);
8274 unsigned Opc = Op.getOpcode();
8275 SDValue Flags = Op.getOperand(1);
8276 EVT HalfDstVT =
8277 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8278 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8279 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8280
8281 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8282}
8283
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // Vector f16 results rely on v_cvt_pk_f16_f32; v2f32 sources are already
  // handled, wider ones are split in half.
  if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
    assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    if (SrcVT.getScalarType() != MVT::f32)
      return SDValue();
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  }

  // Everything below deals with f64 sources only.
  if (SrcVT.getScalarType() != MVT::f64)
    return Op;

  SDLoc DL(Op);
  if (DstVT == MVT::f16) {
    // TODO: Handle strictfp
    if (Op.getOpcode() != ISD::FP_ROUND)
      return Op;

    if (!Subtarget->has16BitInsts()) {
      // No 16-bit instructions: go through the fp16-in-i32 FP_TO_FP16 node.
      SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    }
    if (Op->getFlags().hasApproximateFuncs()) {
      // afn permits the double-rounding f64 -> f32 -> f16 sequence.
      SDValue Flags = Op.getOperand(1);
      SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
      return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
    }
    // Otherwise use the safe f64 -> f16 lowering helper.
    SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
  }

  assert(DstVT.getScalarType() == MVT::bf16 &&
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");

  // Round-inexact-to-odd f64 to f32, then do the final rounding using the
  // hardware f32 -> bf16 instruction.
  EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
  SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
  return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}
8331
8332SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8333 SelectionDAG &DAG) const {
8334 EVT VT = Op.getValueType();
8335 const MachineFunction &MF = DAG.getMachineFunction();
8336 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8337 bool IsIEEEMode = Info->getMode().IEEE;
8338
8339 // FIXME: Assert during selection that this is only selected for
8340 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8341 // mode functions, but this happens to be OK since it's only done in cases
8342 // where there is known no sNaN.
8343 if (IsIEEEMode)
8344 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8345
8346 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8347 VT == MVT::v16bf16)
8348 return splitBinaryVectorOp(Op, DAG);
8349 return Op;
8350}
8351
8352SDValue
8353SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8354 SelectionDAG &DAG) const {
8355 EVT VT = Op.getValueType();
8356 const MachineFunction &MF = DAG.getMachineFunction();
8357 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8358 bool IsIEEEMode = Info->getMode().IEEE;
8359
8360 if (IsIEEEMode)
8361 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8362
8363 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8364 VT == MVT::v16bf16)
8365 return splitBinaryVectorOp(Op, DAG);
8366 return Op;
8367}
8368
8369SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8370 SelectionDAG &DAG) const {
8371 EVT VT = Op.getValueType();
8372 if (VT.isVector())
8373 return splitBinaryVectorOp(Op, DAG);
8374
8375 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8376 !Subtarget->hasMinimum3Maximum3F16() &&
8377 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8378 "should not need to widen f16 minimum/maximum to v2f16");
8379
8380 // Widen f16 operation to v2f16
8381
8382 // fminimum f16:x, f16:y ->
8383 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8384 // (v2f16 (scalar_to_vector y))), 0
8385 SDLoc SL(Op);
8386 SDValue WideSrc0 =
8387 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8388 SDValue WideSrc1 =
8389 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8390
8391 SDValue Widened =
8392 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8393
8394 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8395 DAG.getConstant(0, SL, MVT::i32));
8396}
8397
8398SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8399 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8400 EVT VT = Op.getValueType();
8401 assert(VT == MVT::f16);
8402
8403 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8404 EVT ExpVT = Exp.getValueType();
8405 if (ExpVT == MVT::i16)
8406 return Op;
8407
8408 SDLoc DL(Op);
8409
8410 // Correct the exponent type for f16 to i16.
8411 // Clamp the range of the exponent to the instruction's range.
8412
8413 // TODO: This should be a generic narrowing legalization, and can easily be
8414 // for GlobalISel.
8415
8416 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8417 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8418
8419 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8420 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8421
8422 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8423
8424 if (IsStrict) {
8425 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8426 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8427 }
8428
8429 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8430}
8431
  // Choose how the narrow operands of Op must be extended to i32 so the
  // promoted operation produces the same low bits as the original.
  switch (Op->getOpcode()) {
  case ISD::SRA:
  case ISD::SMIN:
  case ISD::SMAX:
    // Signed ops need the sign bit replicated into the high bits.
    return ISD::SIGN_EXTEND;
  case ISD::SRL:
  case ISD::UMIN:
  case ISD::UMAX:
    // Unsigned ops need zeroed high bits.
    return ISD::ZERO_EXTEND;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SELECT:
  case ISD::MUL:
    // operation result won't be influenced by garbage high bits.
    // TODO: are all of those cases correct, and are there more?
    return ISD::ANY_EXTEND;
  case ISD::SETCC: {
    // The extension for a comparison follows the condition code.
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  }
  default:
    llvm_unreachable("unexpected opcode!");
  }
}
8461
8462SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8463 DAGCombinerInfo &DCI) const {
8464 const unsigned Opc = Op.getOpcode();
8465 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8466 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8467 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8468 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8469 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8470
8471 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8472 : Op->getOperand(0).getValueType();
8473 auto &DAG = DCI.DAG;
8474 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8475
8476 if (DCI.isBeforeLegalizeOps() ||
8477 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8478 return SDValue();
8479
8480 SDLoc DL(Op);
8481 SDValue LHS;
8482 SDValue RHS;
8483 if (Opc == ISD::SELECT) {
8484 LHS = Op->getOperand(1);
8485 RHS = Op->getOperand(2);
8486 } else {
8487 LHS = Op->getOperand(0);
8488 RHS = Op->getOperand(1);
8489 }
8490
8491 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8492 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8493
8494 // Special case: for shifts, the RHS always needs a zext.
8495 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8496 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8497 else
8498 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8499
8500 // setcc always return i1/i1 vec so no need to truncate after.
8501 if (Opc == ISD::SETCC) {
8502 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8503 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8504 }
8505
8506 // For other ops, we extend the operation's return type as well so we need to
8507 // truncate back to the original type.
8508 SDValue NewVal;
8509 if (Opc == ISD::SELECT)
8510 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8511 else
8512 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8513
8514 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8515}
8516
8517SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8518 SDValue Mag = Op.getOperand(0);
8519 EVT MagVT = Mag.getValueType();
8520
8521 if (MagVT.getVectorNumElements() > 2)
8522 return splitBinaryVectorOp(Op, DAG);
8523
8524 SDValue Sign = Op.getOperand(1);
8525 EVT SignVT = Sign.getValueType();
8526
8527 if (MagVT == SignVT)
8528 return Op;
8529
8530 // fcopysign v2f16:mag, v2f32:sign ->
8531 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8532
8533 SDLoc SL(Op);
8534 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8535 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8536
8537 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8538
8539 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8540}
8541
// Custom lowering for vector multiplications and s_mul_u64.
//
// Vector multiplies are split into smaller pieces; a uniform i64 multiply is
// either kept as s_mul_u64 or rewritten to a 32x32->64 pseudo when known-bits
// analysis proves both operands fit in 32 bits (see the numbered cases below).
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  //    multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  //    operands are zero-extended/sign-extended from 32-bits, then we split the
  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  //    possible to check if the operands are zero-extended or sign-extended in
  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  //    If the cost model decides that we have to use vector registers, then
  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  //    SIInstrInfo.cpp .

  // Divergent case (2): let the generic/ISel path split into 32-bit pieces.
  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  // 33 sign bits means the value fits in 32 bits when sign-extended.
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
8601
8602SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8603 EVT VT = Op.getValueType();
8604 SDLoc SL(Op);
8605 SDValue LHS = Op.getOperand(0);
8606 SDValue RHS = Op.getOperand(1);
8607 bool isSigned = Op.getOpcode() == ISD::SMULO;
8608
8609 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8610 const APInt &C = RHSC->getAPIntValue();
8611 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8612 if (C.isPowerOf2()) {
8613 // smulo(x, signed_min) is same as umulo(x, signed_min).
8614 bool UseArithShift = isSigned && !C.isMinSignedValue();
8615 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8616 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8617 SDValue Overflow =
8618 DAG.getSetCC(SL, MVT::i1,
8619 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8620 Result, ShiftAmt),
8621 LHS, ISD::SETNE);
8622 return DAG.getMergeValues({Result, Overflow}, SL);
8623 }
8624 }
8625
8626 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8627 SDValue Top =
8628 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8629
8630 SDValue Sign = isSigned
8631 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8632 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8633 SL, MVT::i32))
8634 : DAG.getConstant(0, SL, VT);
8635 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8636
8637 return DAG.getMergeValues({Result, Overflow}, SL);
8638}
8639
8640SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8641 if (Op->isDivergent()) {
8642 // Select to V_MAD_[IU]64_[IU]32.
8643 return Op;
8644 }
8645 if (Subtarget->hasSMulHi()) {
8646 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8647 return SDValue();
8648 }
8649 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8650 // calculate the high part, so we might as well do the whole thing with
8651 // V_MAD_[IU]64_[IU]32.
8652 return Op;
8653}
8654
8655SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8656 if (!Subtarget->hasTrapHandler() ||
8657 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8658 return lowerTrapEndpgm(Op, DAG);
8659
8660 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8661 : lowerTrapHsaQueuePtr(Op, DAG);
8662}
8663
8664SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8665 SDLoc SL(Op);
8666 SDValue Chain = Op.getOperand(0);
8667 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8668}
8669
/// Emit a load of the implicit kernel argument \p Param.
///
/// Computes the parameter's byte offset in the kernarg segment, builds a
/// pointer to it, and loads \p VT with alignment \p Alignment. The load is
/// chained to the entry node, so it carries no ordering dependencies.
SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             const SDLoc &DL, Align Alignment,
                                             ImplicitParameter Param) const {
  MachineFunction &MF = DAG.getMachineFunction();
  // Byte offset of this implicit parameter within the kernarg segment.
  uint64_t Offset = getImplicitParameterOffset(MF, Param);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
  MachinePointerInfo PtrInfo =
  return DAG.getLoad(
      VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
}
8683
/// Lower a trap for HSA targets that do not support reading the doorbell ID
/// directly: the queue pointer is materialized into SGPR0_SGPR1 (as the trap
/// handler ABI expects) before emitting the TRAP node.
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    // Older code object versions: the queue pointer arrives in a user SGPR.
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // Copy the queue pointer into SGPR0_SGPR1; the glue result threads the copy
  // into the TRAP node so it cannot be scheduled away.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
8719
8720SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8721 SDLoc SL(Op);
8722 SDValue Chain = Op.getOperand(0);
8723
8724 // We need to simulate the 's_trap 2' instruction on targets that run in
8725 // PRIV=1 (where it is treated as a nop).
8726 if (Subtarget->hasPrivEnabledTrap2NopBug())
8727 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8728
8729 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8730 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8731 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8732}
8733
8734SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8735 SDLoc SL(Op);
8736 SDValue Chain = Op.getOperand(0);
8737 MachineFunction &MF = DAG.getMachineFunction();
8738
8739 if (!Subtarget->hasTrapHandler() ||
8740 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8741 LLVMContext &Ctx = MF.getFunction().getContext();
8742 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8743 "debugtrap handler not supported",
8744 Op.getDebugLoc(), DS_Warning));
8745 return Chain;
8746 }
8747
8748 uint64_t TrapID =
8749 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8750 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8751 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8752}
8753
/// Return the high 32 bits of the flat aperture for address space \p AS
/// (LOCAL or PRIVATE), used as the upper half when widening a segment pointer
/// to a 64-bit flat pointer. Sourced from the aperture registers, the
/// implicit kernargs, or the queue descriptor, depending on the subtarget.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    // s_mov_b64 s[6:7], src_shared_base
    // v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  // Fallback: read the aperture out of the amd_queue_t the queue pointer
  // user SGPR points at.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
}
8814
/// Return true if the value is a known valid address, such that a null check is
/// not necessary.
    const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
    return true;

  // A constant pointer is known non-null as long as it is not the address
  // space's designated null value.
  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);

  // TODO: Search through arithmetic, handle arguments and loads
  // marked nonnull.
  return false;
}
8829
/// Lower ISD::ADDRSPACECAST (and the amdgcn.addrspacecast.nonnull intrinsic)
/// between flat, local, private, and 32-bit constant address spaces.
///
/// Casts from flat truncate to 32 bits; casts to flat rebuild the upper half
/// from the segment aperture (or the flat-scratch base for globally
/// addressable scratch). Unless the source is known non-null, a select
/// preserves the null value across the cast. Invalid casts become poison.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    // The nonnull intrinsic carries the address spaces as constant operands
    // and guarantees the pointer is not null, so the null-preserving select
    // below can be skipped.
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                AMDGPU::S_MOV_B32, SL, MVT::i32,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
            0);
        Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      // Map the flat null pointer to the segment null pointer.
      unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
        ThreadID = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
            AllOnes, ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
              AllOnes, ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                AMDGPU::S_MOV_B64, SL, MVT::i64,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
            0};
        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      } else {
        // Build the 64-bit flat pointer as {segment offset, aperture high}.
        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      unsigned NullVal = AMDGPU::getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  // constant32bit -> 64-bit: widen with the function's known high bits.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    if (Info->get32BitAddressHighBits() == 0)
      return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);

    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(Op->getValueType(0));
}
8960
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue Ins = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // 16-bit elements inserted at an even index: work on bitcast i32 vectors so
  // whole 32-bit registers move at a time.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                              MVT::i32, InsNumElts / 2);

    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        // A two-element insert bitcasts to a scalar i32; use it directly.
        Elt = Ins;
      } else {
        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
                          DAG.getConstant(I, SL, MVT::i32));
      }
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
    }

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  }

  // General path: move the subvector one element at a time.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                              DAG.getConstant(I, SL, MVT::i32));
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
  }
  return Vec;
}
9015
/// Custom lowering for INSERT_VECTOR_ELT on small (<= 64-bit) vectors.
///
/// Static v4i16/v4f16-style inserts are done per 32-bit half; dynamic indices
/// are lowered to a bitmask blend (v_bfi-style) instead of going through a
/// stack slot.
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // Split the vector into two 32-bit halves, insert into the relevant half,
    // and reassemble.
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    SDValue Concat =
        InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
                 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
                               DAG.getSplatBuildVector(VecVT, SL, InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue RHS =
      DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI =
      DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);

  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
9095
/// Custom lowering for EXTRACT_VECTOR_ELT.
///
/// Large vectors (128/256/512 bits) are split in half and the index selects
/// the half; small vectors (<= 64 bits) are lowered to a shift of the
/// bitcast integer, avoiding stack access for dynamic indices.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    // Split the vector into Lo/Hi halves via i64 pieces.
    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    // Select the half containing the element, then extract within it using
    // the index masked down to the half's range.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the selected element down to bit 0 of the bitcast integer.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
9197
9198static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9199 assert(Elt % 2 == 0);
9200 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9201}
9202
9203static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9204 assert(Elt % 2 == 0);
9205 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9206 !(Mask[Elt + 1] & 1);
9207}
9208
/// Custom lowering for VECTOR_SHUFFLE: decompose the shuffle into two-element
/// pieces (aligned subvector extracts, small sub-shuffles, or per-element
/// extracts) and concatenate the pieces back together.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                   vector_shuffle <1, 0> (extract_subvector rhs, 2)

  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (ShouldUseConsecutiveExtract &&
      // Consecutive pair: a single aligned two-element subvector extract.
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
                                   SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
      int Idx0 = SVN->getMaskElt(I);
      int Idx1 = SVN->getMaskElt(I + 1);

      // Resolve each mask index to its source operand and local index.
      SDValue SrcOp0 = SVN->getOperand(0);
      SDValue SrcOp1 = SrcOp0;
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
                                    DAG.getConstant(AlignedIdx0, SL, MVT::i32));
      SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
                                    DAG.getConstant(AlignedIdx1, SL, MVT::i32));

      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        Result1 = DAG.getPOISON(PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
                                          {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Shuf);
    } else {
      // Fallback: scalarize the pair into two element extracts.
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
                                 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
                                 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
9327
9328SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9329 SelectionDAG &DAG) const {
9330 SDValue SVal = Op.getOperand(0);
9331 EVT ResultVT = Op.getValueType();
9332 EVT SValVT = SVal.getValueType();
9333 SDValue UndefVal = DAG.getPOISON(SValVT);
9334 SDLoc SL(Op);
9335
9337 VElts.push_back(SVal);
9338 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9339 VElts.push_back(UndefVal);
9340
9341 return DAG.getBuildVector(ResultVT, SL, VElts);
9342}
9343
9344SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9345 SelectionDAG &DAG) const {
9346 SDLoc SL(Op);
9347 EVT VT = Op.getValueType();
9348
9349 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9350 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9351
9352 SDValue Lo = Op.getOperand(0);
9353 SDValue Hi = Op.getOperand(1);
9354
9355 // Avoid adding defined bits with the zero_extend.
9356 if (Hi.isUndef()) {
9357 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9358 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
9359 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
9360 }
9361
9362 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
9363 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
9364
9365 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
9366 DAG.getConstant(16, SL, MVT::i32));
9367 if (Lo.isUndef())
9368 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
9369
9370 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9371 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
9372
9373 SDValue Or =
9374 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
9375 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
9376 }
9377
9378 // Split into 2-element chunks.
9379 const unsigned NumParts = VT.getVectorNumElements() / 2;
9380 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
9381 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
9382
9384 for (unsigned P = 0; P < NumParts; ++P) {
9385 SDValue Vec = DAG.getBuildVector(
9386 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
9387 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
9388 }
9389
9390 SDValue Blend =
9391 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
9392 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
9393}
9394
// Presumably SITargetLowering::isOffsetFoldingLegal — the signature's first
// line is not visible in this excerpt; confirm against the full source.
// Returns whether a constant offset may be folded into a global address.
9396 const GlobalAddressSDNode *GA) const {
9397 // OSes that use ELF REL relocations (instead of RELA) can only store a
9398 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9399 // which can create arbitrary 64-bit addends. (This is only a problem for
9400 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9401 // the high 32 bits of the addend.)
9402 //
9403 // This should be kept in sync with how HasRelocationAddend is initialized in
9404 // the constructor of ELFAMDGPUAsmBackend.
9405 if (!Subtarget->isAmdHsaOS())
9406 return false;
9407
9408 // We can fold offsets for anything that doesn't require a GOT relocation.
9409 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
// NOTE(review): the remaining address-space comparisons of this condition are
// not visible in this excerpt — verify against the full source.
9413 }
9414
/// Build a DAG sequence computing the address of \p GV plus \p Offset using
/// the PC_ADD_REL_OFFSET(64) node. \p GAFlags selects the relocation variant;
/// MO_NONE produces a 32-bit lo half with a constant-zero hi half.
9415 static SDValue
// NOTE(review): the line naming this function and its first parameters is not
// visible in this excerpt (presumably buildPCRelGlobalAddress(SelectionDAG &,
// const GlobalValue *, ...)) — confirm against the full source.
9417 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9418 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9419 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9420 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9421 // lowered to the following code sequence:
9422 //
9423 // For constant address space:
9424 // s_getpc_b64 s[0:1]
9425 // s_add_u32 s0, s0, $symbol
9426 // s_addc_u32 s1, s1, 0
9427 //
9428 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9429 // a fixup or relocation is emitted to replace $symbol with a literal
9430 // constant, which is a pc-relative offset from the encoding of the $symbol
9431 // operand to the global variable.
9432 //
9433 // For global address space:
9434 // s_getpc_b64 s[0:1]
9435 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9436 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9437 //
9438 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9439 // fixups or relocations are emitted to replace $symbol@*@lo and
9440 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9441 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9442 // operand to the global variable.
// On subtargets with 64-bit literals a single 64-bit relocation operand is
// used instead of the lo/hi pair.
9443 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9444 assert(GAFlags != SIInstrInfo::MO_NONE);
9445
9446 SDValue Ptr =
9447 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9448 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9449 }
9450
9451 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9452 SDValue PtrHi;
9453 if (GAFlags == SIInstrInfo::MO_NONE)
9454 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9455 else
9456 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9457 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9458 }
9459
/// Lower a GlobalAddress node: handles dynamic LDS globals, PAL/Mesa absolute
/// addressing, pc-relative fixups/relocations, and finally a GOT load.
// NOTE(review): several condition/argument lines of this function are not
// visible in this excerpt — the elisions are preserved below.
9460 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
9461 SDValue Op,
9462 SelectionDAG &DAG) const {
9463 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9464 SDLoc DL(GSD);
9465 EVT PtrVT = Op.getValueType();
9466
9467 const GlobalValue *GV = GSD->getGlobal();
9473 GV->hasExternalLinkage()) {
9474 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
9475 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9476 // zero-sized type in other languages to declare the dynamic shared
9477 // memory which size is not known at the compile time. They will be
9478 // allocated by the runtime and placed directly after the static
9479 // allocated ones. They all share the same offset.
9480 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
9481 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9482 // Adjust alignment for that dynamic shared memory array.
9484 MFI->setDynLDSAlign(F, GVar);
9485 MFI->setUsesDynamicLDS(true);
// The dynamic LDS block starts right after the statically allocated LDS,
// whose total size GET_GROUPSTATICSIZE materializes.
9486 return SDValue(
9487 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9488 }
9489 }
9491 }
9492
// Static LDS globals: wrap the target address in an AMDGPUISD::LDS node.
9494 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9496 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9497 }
9498
// PAL/Mesa3D: materialize an absolute address with S_MOV (one 64-bit move
// when 64-bit literals are available, otherwise a lo/hi pair).
9499 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9500 if (Subtarget->has64BitLiterals()) {
9502 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9503 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9504 0);
9505 }
9506
9507 SDValue AddrLo = DAG.getTargetGlobalAddress(
9508 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9509 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9510
9511 SDValue AddrHi = DAG.getTargetGlobalAddress(
9512 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9513 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9514
9515 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9516 }
9517
9518 if (shouldEmitFixup(GV))
9519 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9520
9521 if (shouldEmitPCReloc(GV))
9522 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9524
// Otherwise the address must come from the GOT: compute the GOT slot
// address pc-relatively and load the pointer from it.
9525 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9527 PointerType *PtrTy =
9529 const DataLayout &DataLayout = DAG.getDataLayout();
9530 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9531 MachinePointerInfo PtrInfo =
9533
9534 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9537 }
9538
9539SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9540 SelectionDAG &DAG) const {
9541 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9542 const Function &Fn = DAG.getMachineFunction().getFunction();
9543 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9544 Fn, "unsupported external symbol", Op.getDebugLoc()));
9545 return DAG.getPOISON(Op.getValueType());
9546}
9547
/// Copy \p V into m0 via the SI_INIT_M0 pseudo and return the resulting node
/// (result 0 is the MVT::Other chain; result 1 is the glue).
// NOTE(review): the signature's first line (presumably
// SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, ...)) is not
// visible in this excerpt — confirm against the full source.
9549 const SDLoc &DL, SDValue V) const {
9550 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9551 // the destination register.
9552 //
9553 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9554 // so we will end up with redundant moves to m0.
9555 //
9556 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9557
9558 // A Null SDValue creates a glue result.
9559 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9560 V, Chain);
9561 return SDValue(M0, 0);
9562 }
9563
9564SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9565 MVT VT,
9566 unsigned Offset) const {
9567 SDLoc SL(Op);
9568 SDValue Param = lowerKernargMemParameter(
9569 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9570 // The local size values will have the hi 16-bits as zero.
9571 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9572 DAG.getValueType(VT));
9573}
9574
/// Diagnose an intrinsic that is invalid for the current (HSA) target — the
/// message text is "non-hsa intrinsic with hsa target" — and return a poison
/// value of type \p VT so lowering can continue.
// NOTE(review): the signature and the start of the diagnose() call are not
// visible in this excerpt — confirm against the full source.
9576 EVT VT) {
9579 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9580 return DAG.getPOISON(VT);
9581 }
9582
/// Diagnose an intrinsic that the current subtarget does not support — the
/// message text is "intrinsic not supported on subtarget" — and return a
/// poison value of type \p VT so lowering can continue.
// NOTE(review): the signature and the start of the diagnose() call are not
// visible in this excerpt — confirm against the full source.
9584 EVT VT) {
9587 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9588 return DAG.getPOISON(VT);
9589 }
9590
/// Bitcast each element to f32 and assemble them into an f32 vector: up to 12
/// elements produce an exactly-sized vector; 13-16 elements are padded with
/// poison up to v16f32. A single element is returned as a scalar.
// NOTE(review): the signature's first line (presumably
// getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL, ...)) is not
// visible in this excerpt.
9592 ArrayRef<SDValue> Elts) {
9593 assert(!Elts.empty());
9594 MVT Type;
9595 unsigned NumElts = Elts.size();
9596
9597 if (NumElts <= 12) {
9598 Type = MVT::getVectorVT(MVT::f32, NumElts);
9599 } else {
9600 assert(Elts.size() <= 16);
9601 Type = MVT::v16f32;
9602 NumElts = 16;
9603 }
9604
9605 SmallVector<SDValue, 16> VecElts(NumElts);
9606 for (unsigned i = 0; i < Elts.size(); ++i) {
9607 SDValue Elt = Elts[i];
9608 if (Elt.getValueType() != MVT::f32)
9609 Elt = DAG.getBitcast(MVT::f32, Elt);
9610 VecElts[i] = Elt;
9611 }
// Pad the remainder (only when more than 12 elements were supplied).
9612 for (unsigned i = Elts.size(); i < NumElts; ++i)
9613 VecElts[i] = DAG.getPOISON(MVT::f32);
9614
9615 if (NumElts == 1)
9616 return VecElts[0];
9617 return DAG.getBuildVector(Type, DL, VecElts);
9618 }
9619
9620static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9621 SDValue Src, int ExtraElts) {
9622 EVT SrcVT = Src.getValueType();
9623
9625
9626 if (SrcVT.isVector())
9627 DAG.ExtractVectorElements(Src, Elts);
9628 else
9629 Elts.push_back(Src);
9630
9631 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9632 while (ExtraElts--)
9633 Elts.push_back(Undef);
9634
9635 return DAG.getBuildVector(CastVT, DL, Elts);
9636}
9637
9638 // Re-construct the required return value for a image load intrinsic.
9639 // This is more complicated due to the optional use TexFailCtrl which means the
9640 // required return type is an aggregate
//
// Result  - the MIMG machine node whose raw dword results are reshaped.
// IsTexFail - append the extra TFE/LWE status dword to the merge values.
// Unpacked/IsD16 - control d16 repacking via adjustLoadValueTypeImpl.
// DMaskPop - number of elements selected by the dmask.
// NOTE(review): the signature's first line (presumably
// constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ...)) is not
// visible in this excerpt.
9642 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9643 bool Unpacked, bool IsD16, int DMaskPop,
9644 int NumVDataDwords, bool IsAtomicPacked16Bit,
9645 const SDLoc &DL) {
9646 // Determine the required return type. This is the same regardless of
9647 // IsTexFail flag
9648 EVT ReqRetVT = ResultTypes[0];
9649 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9650 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9651 ? (ReqRetNumElts + 1) / 2
9652 : ReqRetNumElts;
9653
9654 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9655
9656 MVT DataDwordVT =
9657 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9658
9659 MVT MaskPopVT =
9660 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9661
9662 SDValue Data(Result, 0);
9663 SDValue TexFail;
9664
// Trim the raw result down to the dwords actually selected by the dmask.
9665 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9666 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9667 if (MaskPopVT.isVector()) {
9668 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9669 SDValue(Result, 0), ZeroIdx);
9670 } else {
9671 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9672 SDValue(Result, 0), ZeroIdx);
9673 }
9674 }
9675
9676 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9677 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9678 NumDataDwords - MaskPopDwords);
9679
9680 if (IsD16)
9681 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9682
9683 EVT LegalReqRetVT = ReqRetVT;
9684 if (!ReqRetVT.isVector()) {
9685 if (!Data.getValueType().isInteger())
9686 Data = DAG.getNode(ISD::BITCAST, DL,
9687 Data.getValueType().changeTypeToInteger(), Data);
9688 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9689 } else {
9690 // We need to widen the return vector to a legal type
9691 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9692 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9693 LegalReqRetVT =
9695 ReqRetVT.getVectorNumElements() + 1);
9696 }
9697 }
9698 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9699
// The TFE/LWE status dword sits immediately after the data dwords.
9700 if (IsTexFail) {
9701 TexFail =
9702 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9703 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9704
9705 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9706 }
9707
9708 if (Result->getNumValues() == 1)
9709 return Data;
9710
9711 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9712 }
9713
9714static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9715 SDValue *LWE, bool &IsTexFail) {
9716 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9717
9718 uint64_t Value = TexFailCtrlConst->getZExtValue();
9719 if (Value) {
9720 IsTexFail = true;
9721 }
9722
9723 SDLoc DL(TexFailCtrlConst);
9724 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9725 Value &= ~(uint64_t)0x1;
9726 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9727 Value &= ~(uint64_t)0x2;
9728
9729 return Value == 0;
9730}
9731
/// Pack 16-bit image address/gradient operands in [DimIdx, EndIdx) two per
/// dword (as PackVectorVT), bitcast each dword to f32, and append them to
/// PackedAddrs. Odd trailing operands are any-extended into a full dword.
// NOTE(review): the signature's first line (presumably
// packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, ...)) is not
// visible in this excerpt.
9733 MVT PackVectorVT,
9734 SmallVectorImpl<SDValue> &PackedAddrs,
9735 unsigned DimIdx, unsigned EndIdx,
9736 unsigned NumGradients) {
9737 SDLoc DL(Op);
9738 for (unsigned I = DimIdx; I < EndIdx; I++) {
9739 SDValue Addr = Op.getOperand(I);
9740
9741 // Gradients are packed with undef for each coordinate.
9742 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9743 // 1D: undef,dx/dh; undef,dx/dv
9744 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9745 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9746 if (((I + 1) >= EndIdx) ||
9747 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9748 I == DimIdx + NumGradients - 1))) {
// Last operand of the range, or the odd tail of a gradient group:
// occupy a whole dword by any-extending instead of pairing.
9749 if (Addr.getValueType() != MVT::i16)
9750 Addr = DAG.getBitcast(MVT::i16, Addr);
9751 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9752 } else {
9753 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9754 I++;
9755 }
9756 Addr = DAG.getBitcast(MVT::f32, Addr);
9757 PackedAddrs.push_back(Addr);
9758 }
9759 }
9760
/// Lower an image intrinsic (sample/load/store/gather/atomic) to a MIMG
/// machine node: packs 16-bit addresses and gradients, chooses NSA vs.
/// contiguous addressing, selects the encoding for the subtarget generation,
/// and adjusts the return type for D16 packing and TFE/LWE status results.
// NOTE(review): several declaration/argument lines of this function are not
// visible in this excerpt — the elisions are preserved below.
9761 SDValue SITargetLowering::lowerImage(SDValue Op,
9763 SelectionDAG &DAG, bool WithChain) const {
9764 SDLoc DL(Op);
9765 MachineFunction &MF = DAG.getMachineFunction();
9766 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9767 unsigned IntrOpcode = Intr->BaseOpcode;
9768 // For image atomic: use no-return opcode if result is unused.
9769 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9770 !Op.getNode()->hasAnyUseOfValue(0))
9771 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9772 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9774 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9775 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9776 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9777 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9778
9779 SmallVector<EVT, 3> ResultTypes(Op->values());
9780 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9781 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9782 ResultTypes.erase(&ResultTypes[0]);
9783
9784 bool IsD16 = false;
9785 bool IsG16 = false;
9786 bool IsA16 = false;
9787 SDValue VData;
9788 int NumVDataDwords = 0;
9789 bool AdjustRetType = false;
9790 bool IsAtomicPacked16Bit = false;
9791
9792 // Offset of intrinsic arguments
9793 const unsigned ArgOffset = WithChain ? 2 : 1;
9794
9795 unsigned DMask;
9796 unsigned DMaskLanes = 0;
9797
9798 if (BaseOpcode->Atomic) {
9799 VData = Op.getOperand(2);
9800
9801 IsAtomicPacked16Bit =
9802 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9803 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9804 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9805 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9806
9807 bool Is64Bit = VData.getValueSizeInBits() == 64;
9808 if (BaseOpcode->AtomicX2) {
9809 SDValue VData2 = Op.getOperand(3);
9810 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9811 {VData, VData2});
9812 if (Is64Bit)
9813 VData = DAG.getBitcast(MVT::v4i32, VData);
9814
9815 if (!BaseOpcode->NoReturn)
9816 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9817
9818 DMask = Is64Bit ? 0xf : 0x3;
9819 NumVDataDwords = Is64Bit ? 4 : 2;
9820 } else {
9821 DMask = Is64Bit ? 0x3 : 0x1;
9822 NumVDataDwords = Is64Bit ? 2 : 1;
9823 }
9824 } else {
9825 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9826 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9827
9828 if (BaseOpcode->Store) {
9829 VData = Op.getOperand(2);
9830
9831 MVT StoreVT = VData.getSimpleValueType();
9832 if (StoreVT.getScalarType() == MVT::f16) {
9833 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9834 return Op; // D16 is unsupported for this instruction
9835
9836 IsD16 = true;
9837 VData = handleD16VData(VData, DAG, true);
9838 }
9839
9840 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9841 } else if (!BaseOpcode->NoReturn) {
9842 // Work out the num dwords based on the dmask popcount and underlying type
9843 // and whether packing is supported.
9844 MVT LoadVT = ResultTypes[0].getSimpleVT();
9845 if (LoadVT.getScalarType() == MVT::f16) {
9846 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9847 return Op; // D16 is unsupported for this instruction
9848
9849 IsD16 = true;
9850 }
9851
9852 // Confirm that the return type is large enough for the dmask specified
9853 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9854 (!LoadVT.isVector() && DMaskLanes > 1))
9855 return Op;
9856
9857 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9858 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9859 // instructions.
9860 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9861 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9862 NumVDataDwords = (DMaskLanes + 1) / 2;
9863 else
9864 NumVDataDwords = DMaskLanes;
9865
9866 AdjustRetType = true;
9867 }
9868 }
9869
9870 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9872
9873 // Check for 16 bit addresses or derivatives and pack if true.
9874 MVT VAddrVT =
9875 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9876 MVT VAddrScalarVT = VAddrVT.getScalarType();
9877 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9878 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9879
9880 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9881 VAddrScalarVT = VAddrVT.getScalarType();
9882 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9883 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9884
9885 // Push back extra arguments.
9886 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9887 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9888 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9889 // Special handling of bias when A16 is on. Bias is of type half but
9890 // occupies full 32-bit.
9891 SDValue Bias = DAG.getBuildVector(
9892 MVT::v2f16, DL,
9893 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9894 VAddrs.push_back(Bias);
9895 } else {
9896 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9897 "Bias needs to be converted to 16 bit in A16 mode");
9898 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9899 }
9900 }
9901
9902 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9903 // 16 bit gradients are supported, but are tied to the A16 control
9904 // so both gradients and addresses must be 16 bit
9905 LLVM_DEBUG(
9906 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9907 "require 16 bit args for both gradients and addresses");
9908 return Op;
9909 }
9910
9911 if (IsA16) {
9912 if (!ST->hasA16()) {
9913 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9914 "support 16 bit addresses\n");
9915 return Op;
9916 }
9917 }
9918
9919 // We've dealt with incorrect input so we know that if IsA16, IsG16
9920 // are set then we have to compress/pack operands (either address,
9921 // gradient or both)
9922 // In the case where a16 and gradients are tied (no G16 support) then we
9923 // have already verified that both IsA16 and IsG16 are true
9924 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9925 // Activate g16
9926 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9928 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9929 }
9930
9931 // Add gradients (packed or unpacked)
9932 if (IsG16) {
9933 // Pack the gradients
9934 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9935 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9936 ArgOffset + Intr->GradientStart,
9937 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9938 } else {
9939 for (unsigned I = ArgOffset + Intr->GradientStart;
9940 I < ArgOffset + Intr->CoordStart; I++)
9941 VAddrs.push_back(Op.getOperand(I));
9942 }
9943
9944 // Add addresses (packed or unpacked)
9945 if (IsA16) {
9946 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9947 ArgOffset + Intr->CoordStart, VAddrEnd,
9948 0 /* No gradients */);
9949 } else {
9950 // Add uncompressed address
9951 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9952 VAddrs.push_back(Op.getOperand(I));
9953 }
9954
9955 // If the register allocator cannot place the address registers contiguously
9956 // without introducing moves, then using the non-sequential address encoding
9957 // is always preferable, since it saves VALU instructions and is usually a
9958 // wash in terms of code size or even better.
9959 //
9960 // However, we currently have no way of hinting to the register allocator that
9961 // MIMG addresses should be placed contiguously when it is possible to do so,
9962 // so force non-NSA for the common 2-address case as a heuristic.
9963 //
9964 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9965 // allocation when possible.
9966 //
9967 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9968 // set of the remaining addresses.
9969 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9970 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9971 const bool UseNSA = ST->hasNSAEncoding() &&
9972 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9973 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9974 const bool UsePartialNSA =
9975 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9976
9977 SDValue VAddr;
9978 if (UsePartialNSA) {
9979 VAddr = getBuildDwordsVector(DAG, DL,
9980 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9981 } else if (!UseNSA) {
9982 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9983 }
9984
9985 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9986 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9987 SDValue Unorm;
9988 if (!BaseOpcode->Sampler) {
9989 Unorm = True;
9990 } else {
9991 uint64_t UnormConst =
9992 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9993
9994 Unorm = UnormConst ? True : False;
9995 }
9996
9997 SDValue TFE;
9998 SDValue LWE;
9999 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
10000 bool IsTexFail = false;
10001 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10002 return Op;
10003
10004 if (IsTexFail) {
10005 if (!DMaskLanes) {
10006 // Expecting to get an error flag since TFC is on - and dmask is 0
10007 // Force dmask to be at least 1 otherwise the instruction will fail
10008 DMask = 0x1;
10009 DMaskLanes = 1;
10010 NumVDataDwords = 1;
10011 }
10012 NumVDataDwords += 1;
10013 AdjustRetType = true;
10014 }
10015
10016 // Has something earlier tagged that the return type needs adjusting
10017 // This happens if the instruction is a load or has set TexFailCtrl flags
10018 if (AdjustRetType) {
10019 // NumVDataDwords reflects the true number of dwords required in the return
10020 // type
10021 if (DMaskLanes == 0 && !BaseOpcode->Store) {
10022 // This is a no-op load. This can be eliminated
10023 SDValue Undef = DAG.getPOISON(Op.getValueType());
10024 if (isa<MemSDNode>(Op))
10025 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
10026 return Undef;
10027 }
10028
10029 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
10030 MVT::i32, NumVDataDwords)
10031 : MVT::i32;
10032
10033 ResultTypes[0] = NewVT;
10034 if (ResultTypes.size() == 3) {
10035 // Original result was aggregate type used for TexFailCtrl results
10036 // The actual instruction returns as a vector type which has now been
10037 // created. Remove the aggregate result.
10038 ResultTypes.erase(&ResultTypes[1]);
10039 }
10040 }
10041
10042 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
10043 // Keep GLC only when the atomic's result is actually used.
10044 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
10046 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
10048 return Op;
10050
// Assemble the machine-instruction operand list in encoding order.
10051 if (BaseOpcode->Store || BaseOpcode->Atomic)
10052 Ops.push_back(VData); // vdata
10053 if (UsePartialNSA) {
10054 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
10055 Ops.push_back(VAddr);
10056 } else if (UseNSA)
10057 append_range(Ops, VAddrs);
10058 else
10059 Ops.push_back(VAddr);
10060 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
10061 EVT RsrcVT = Rsrc.getValueType();
10062 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10063 return Op;
10064 Ops.push_back(Rsrc);
10065 if (BaseOpcode->Sampler) {
10066 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
10067 if (Samp.getValueType() != MVT::v4i32)
10068 return Op;
10069 Ops.push_back(Samp);
10070 }
10071 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
10072 if (IsGFX10Plus)
10073 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
10074 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10075 Ops.push_back(Unorm);
10076 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
10077 Ops.push_back(IsA16 && // r128, a16 for gfx9
10078 ST->hasFeature(AMDGPU::FeatureR128A16)
10079 ? True
10080 : False);
10081 if (IsGFX10Plus)
10082 Ops.push_back(IsA16 ? True : False);
10083
10084 if (!Subtarget->hasGFX90AInsts())
10085 Ops.push_back(TFE); // tfe
10086 else if (TFE->getAsZExtVal()) {
10087 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10089 "TFE is not supported on this GPU", DL.getDebugLoc()));
10090 }
10091
10092 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10093 Ops.push_back(LWE); // lwe
10094 if (!IsGFX10Plus)
10095 Ops.push_back(DimInfo->DA ? True : False);
10096 if (BaseOpcode->HasD16)
10097 Ops.push_back(IsD16 ? True : False);
10098 if (isa<MemSDNode>(Op))
10099 Ops.push_back(Op.getOperand(0)); // chain
10100
10101 int NumVAddrDwords =
10102 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
10103 int Opcode = -1;
10104
// Pick the MIMG encoding matching the subtarget generation.
10105 if (IsGFX12Plus) {
10106 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
10107 NumVDataDwords, NumVAddrDwords);
10108 } else if (IsGFX11Plus) {
10109 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10110 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10111 : AMDGPU::MIMGEncGfx11Default,
10112 NumVDataDwords, NumVAddrDwords);
10113 } else if (IsGFX10Plus) {
10114 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10115 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10116 : AMDGPU::MIMGEncGfx10Default,
10117 NumVDataDwords, NumVAddrDwords);
10118 } else {
10119 if (Subtarget->hasGFX90AInsts()) {
10120 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
10121 NumVDataDwords, NumVAddrDwords);
10122 if (Opcode == -1) {
10123 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10125 "requested image instruction is not supported on this GPU",
10126 DL.getDebugLoc()));
10127
10128 unsigned Idx = 0;
10129 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
10130 for (EVT VT : OrigResultTypes) {
10131 if (VT == MVT::Other)
10132 RetValues[Idx++] = Op.getOperand(0); // Chain
10133 else
10134 RetValues[Idx++] = DAG.getPOISON(VT);
10135 }
10136
10137 return DAG.getMergeValues(RetValues, DL);
10138 }
10139 }
10140 if (Opcode == -1 &&
10141 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10142 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
10143 NumVDataDwords, NumVAddrDwords);
10144 if (Opcode == -1)
10145 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
10146 NumVDataDwords, NumVAddrDwords);
10147 }
10148 if (Opcode == -1)
10149 return Op;
10150
10151 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
10152 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
10153 MachineMemOperand *MemRef = MemOp->getMemOperand();
10154 DAG.setNodeMemRefs(NewNode, {MemRef});
10155 }
10156
10157 if (BaseOpcode->NoReturn) {
10158 if (BaseOpcode->Atomic)
10159 return DAG.getMergeValues(
10160 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
10161
10162 return SDValue(NewNode, 0);
10163 }
10164
10165 if (BaseOpcode->AtomicX2) {
10167 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
10168 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
10169 }
10170
10171 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
10172 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10173 NumVDataDwords, IsAtomicPacked16Bit, DL);
10174 }
10175
/// Lower an s.buffer.load: a uniform offset becomes an SBUFFER_LOAD (widening
/// vec3 when dwordx3 scalar loads are unavailable); a divergent offset falls
/// back to MUBUF BUFFER_LOADs, split into 16-byte pieces for wide vectors.
// NOTE(review): a few declaration/flag lines of this function are not visible
// in this excerpt — the elisions are preserved below.
10176 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
10177 SDValue Offset, SDValue CachePolicy,
10178 SelectionDAG &DAG) const {
10179 MachineFunction &MF = DAG.getMachineFunction();
10180
10181 const DataLayout &DataLayout = DAG.getDataLayout();
10182 Align Alignment =
10183 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
10184
10185 MachineMemOperand *MMO = MF.getMachineMemOperand(
10186 MachinePointerInfo(),
10189 VT.getStoreSize(), Alignment);
10190
10191 if (!Offset->isDivergent()) {
10192 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10193
10194 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10195 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
10196 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
10197 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
10198 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10199 SDValue BufferLoad =
10200 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
10201 DAG.getVTList(MVT::i32), Ops, VT, MMO);
10202 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
10203 }
10204
10205 // Widen vec3 load to vec4.
10206 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
10207 !Subtarget->hasScalarDwordx3Loads()) {
10208 EVT WidenedVT =
10210 auto WidenedOp = DAG.getMemIntrinsicNode(
10211 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
10212 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
10213 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
10214 DAG.getVectorIdxConstant(0, DL));
10215 return Subvector;
10216 }
10217
10218 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
10219 DAG.getVTList(VT), Ops, VT, MMO);
10220 }
10221
10222 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
10223 // assume that the buffer is unswizzled.
10224 SDValue Ops[] = {
10225 DAG.getEntryNode(), // Chain
10226 Rsrc, // rsrc
10227 DAG.getConstant(0, DL, MVT::i32), // vindex
10228 {}, // voffset
10229 {}, // soffset
10230 {}, // offset
10231 CachePolicy, // cachepolicy
10232 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10233 };
10234 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10235 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
10236 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
10237 }
10238
10240 unsigned NumLoads = 1;
10241 MVT LoadVT = VT.getSimpleVT();
10242 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10243 assert((LoadVT.getScalarType() == MVT::i32 ||
10244 LoadVT.getScalarType() == MVT::f32));
10245
// Wide (8/16-element) results are fetched as multiple dwordx4 loads.
10246 if (NumElts == 8 || NumElts == 16) {
10247 NumLoads = NumElts / 4;
10248 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
10249 }
10250
10251 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
10252
10253 // Use the alignment to ensure that the required offsets will fit into the
10254 // immediate offsets.
10255 setBufferOffsets(Offset, DAG, &Ops[3],
10256 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10257
10258 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10259 for (unsigned i = 0; i < NumLoads; ++i) {
10260 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
10261 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10262 LoadVT, MMO, DAG));
10263 }
10264
10265 if (NumElts == 8 || NumElts == 16)
10266 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
10267
10268 return Loads[0];
10269 }
10270
10271SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10272 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10273 if (!Subtarget->hasArchitectedSGPRs())
10274 return {};
10275 SDLoc SL(Op);
10276 MVT VT = MVT::i32;
10277 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10278 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10279 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10280}
10281
10282SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10283 AMDGPU::Hwreg::Id HwReg,
10284 unsigned LowBit,
10285 unsigned Width) const {
10286 SDLoc SL(Op);
10287 using namespace AMDGPU::Hwreg;
10288 return {DAG.getMachineNode(
10289 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10290 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10291 SL, MVT::i32)),
10292 0};
10293}
10294
10295SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10296 unsigned Dim,
10297 const ArgDescriptor &Arg) const {
10298 SDLoc SL(Op);
10299 MachineFunction &MF = DAG.getMachineFunction();
10300 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10301 if (MaxID == 0)
10302 return DAG.getConstant(0, SL, MVT::i32);
10303
10304 // It's undefined behavior if a function marked with the amdgpu-no-*
10305 // attributes uses the corresponding intrinsic.
10306 if (!Arg)
10307 return DAG.getPOISON(Op->getValueType(0));
10308
10309 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10310 SDLoc(DAG.getEntryNode()), Arg);
10311
10312 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10313 // masking operations anyway.
10314 //
10315 // TODO: We could assert the top bit is 0 for the source copy.
10316 if (Arg.isMasked())
10317 return Val;
10318
10319 // Preserve the known bits after expansion to a copy.
10320 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10321 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10322 DAG.getValueType(SmallVT));
10323}
10324
/// Custom lowering for chainless AMDGPU/R600 intrinsics
/// (ISD::INTRINSIC_WO_CHAIN). Dispatches on the intrinsic ID in operand 0;
/// IDs not handled here either go through image-dimension lowering or are
/// returned unchanged so tablegen patterns can select them.
/// NOTE(review): this rendering of the source elides some continuation lines
/// (mostly AMDGPUFunctionArgInfo enum constants); truncated calls below are
/// reproduced as rendered — confirm against the upstream file.
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = Op.getConstantOperandVal(0);

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, *MFI, VT,
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc()));
      return DAG.getPOISON(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(MF.getFunction())) {
      // This only makes sense to call in a kernel, so just lower to null.
      return DAG.getConstant(0, DL, VT);
    }

    return getPreloadedValue(DAG, *MFI, VT,
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return SDValue();
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

    // No native rsq_clamp: expand to rsq clamped into the largest finite
    // range [-max, +max] of the type.
    Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    SDValue Tmp =
        DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       DAG.getConstantFP(Min, DL, VT));
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::amdgcn_workgroup_id_x:
    return lowerWorkGroupId(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_y:
    return lowerWorkGroupId(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_z:
    return lowerWorkGroupId(DAG, *MFI, VT,
  // Cluster-related IDs are only available on subtargets with clusters;
  // otherwise they fold to poison (or, for the flat ID, stay unlowered).
  case Intrinsic::amdgcn_cluster_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return Subtarget->hasClusters()
               ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
               : SDValue();
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    // s_buffer_load, because of how it's optimized, can't be volatile
    // so reject ones with the volatile bit set.
    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
      return Op;
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    return emitRemovedIntrinsicError(DAG, DL, VT);
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    // Emit the packed result as i32 and bitcast to the (illegal) vector type.
    SDValue Node =
        DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)),
                   0);

  case Intrinsic::amdgcn_groupstaticsize: {
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    // Otherwise materialize the size via an absolute relocation on a
    // declaration of the intrinsic itself.
    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    SDLoc SL(Op);
    // Only the high 32 bits of the 64-bit pointer are needed to classify the
    // address space: compare them against the segment aperture.
    SDValue SrcVec =
        DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
                                DAG.getConstant(1, SL, MVT::i32));

    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      : AMDGPUAS::PRIVATE_ADDRESS;
    if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
        Subtarget->hasGloballyAddressableScratch()) {
      SDValue FlatScratchBaseHi(
          DAG.getMachineNode(
              AMDGPU::S_MOV_B32, DL, MVT::i32,
              DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
          0);
      // Test bits 63..58 against the aperture address.
      return DAG.getSetCC(
          SL, MVT::i1,
          DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
          DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
    }

    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    // Create (or reuse) an i32 global named by the metadata operand and emit
    // its relocated address.
    Module *M = MF.getFunction().getParent();
    const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
    auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
    auto *RelocSymbol = cast<GlobalVariable>(
        M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
    SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
  }
  // SWMMAC intrinsics: canonicalize the index-key operand to the scalar type
  // instruction selection expects, then rebuild the intrinsic node.
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i64)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi64 =
        Op.getOperand(4).getValueType() == MVT::v2i32
            ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
            : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
                        Op.getOperand(6)});
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
    EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
                         ? MVT::i64
                         : MVT::i32;
    if (Op.getOperand(6).getValueType() == IndexKeyTy)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKey =
        Op.getOperand(6).getValueType().isVector()
            ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
            : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
        IndexKey, Op.getOperand(7), Op.getOperand(8)};
    if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
      Args.push_back(Op.getOperand(9));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return lowerLaneOp(*this, Op.getNode(), DAG);
  case Intrinsic::amdgcn_dead: {
    // amdgcn.dead produces a poison value for each of the node's results.
    for (const EVT ValTy : Op.getNode()->values())
      Poisons.push_back(DAG.getPOISON(ValTy));
    return DAG.getMergeValues(Poisons, SDLoc(Op));
  }
  case Intrinsic::amdgcn_wave_shuffle:
    return lowerWaveShuffle(*this, Op.getNode(), DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, false);

    return Op;
  }
}
10821
10822// On targets not supporting constant in soffset field, turn zero to
10823// SGPR_NULL to avoid generating an extra s_mov with zero.
10825 const GCNSubtarget *Subtarget) {
10826 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10827 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10828 return SOffset;
10829}
10830
10831SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10832 SelectionDAG &DAG,
10833 unsigned NewOpcode) const {
10834 SDLoc DL(Op);
10835
10836 SDValue VData = Op.getOperand(2);
10837 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10838 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10839 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10840 SDValue Ops[] = {
10841 Op.getOperand(0), // Chain
10842 VData, // vdata
10843 Rsrc, // rsrc
10844 DAG.getConstant(0, DL, MVT::i32), // vindex
10845 VOffset, // voffset
10846 SOffset, // soffset
10847 Offset, // offset
10848 Op.getOperand(6), // cachepolicy
10849 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10850 };
10851
10852 auto *M = cast<MemSDNode>(Op);
10853
10854 EVT MemVT = VData.getValueType();
10855 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10856 M->getMemOperand());
10857}
10858
10859SDValue
10860SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10861 unsigned NewOpcode) const {
10862 SDLoc DL(Op);
10863
10864 SDValue VData = Op.getOperand(2);
10865 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10866 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10867 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10868 SDValue Ops[] = {
10869 Op.getOperand(0), // Chain
10870 VData, // vdata
10871 Rsrc, // rsrc
10872 Op.getOperand(4), // vindex
10873 VOffset, // voffset
10874 SOffset, // soffset
10875 Offset, // offset
10876 Op.getOperand(7), // cachepolicy
10877 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10878 };
10879
10880 auto *M = cast<MemSDNode>(Op);
10881
10882 EVT MemVT = VData.getValueType();
10883 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10884 M->getMemOperand());
10885}
10886
10887SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10888 SelectionDAG &DAG) const {
10889 unsigned IntrID = Op.getConstantOperandVal(1);
10890 SDLoc DL(Op);
10891
10892 switch (IntrID) {
10893 case Intrinsic::amdgcn_ds_ordered_add:
10894 case Intrinsic::amdgcn_ds_ordered_swap: {
10895 MemSDNode *M = cast<MemSDNode>(Op);
10896 SDValue Chain = M->getOperand(0);
10897 SDValue M0 = M->getOperand(2);
10898 SDValue Value = M->getOperand(3);
10899 unsigned IndexOperand = M->getConstantOperandVal(7);
10900 unsigned WaveRelease = M->getConstantOperandVal(8);
10901 unsigned WaveDone = M->getConstantOperandVal(9);
10902
10903 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10904 IndexOperand &= ~0x3f;
10905 unsigned CountDw = 0;
10906
10907 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10908 CountDw = (IndexOperand >> 24) & 0xf;
10909 IndexOperand &= ~(0xf << 24);
10910
10911 if (CountDw < 1 || CountDw > 4) {
10912 const Function &Fn = DAG.getMachineFunction().getFunction();
10913 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10914 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10915 DL.getDebugLoc()));
10916 CountDw = 1;
10917 }
10918 }
10919
10920 if (IndexOperand) {
10921 const Function &Fn = DAG.getMachineFunction().getFunction();
10922 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10923 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10924 }
10925
10926 if (WaveDone && !WaveRelease) {
10927 // TODO: Move this to IR verifier
10928 const Function &Fn = DAG.getMachineFunction().getFunction();
10929 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10930 Fn, "ds_ordered_count: wave_done requires wave_release",
10931 DL.getDebugLoc()));
10932 }
10933
10934 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10935 unsigned ShaderType =
10937 unsigned Offset0 = OrderedCountIndex << 2;
10938 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10939
10940 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10941 Offset1 |= (CountDw - 1) << 6;
10942
10943 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10944 Offset1 |= ShaderType << 2;
10945
10946 unsigned Offset = Offset0 | (Offset1 << 8);
10947
10948 SDValue Ops[] = {
10949 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10950 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10951 };
10952 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10953 M->getVTList(), Ops, M->getMemoryVT(),
10954 M->getMemOperand());
10955 }
10956 case Intrinsic::amdgcn_raw_buffer_load:
10957 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10958 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10959 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10960 case Intrinsic::amdgcn_raw_buffer_load_format:
10961 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10962 const bool IsFormat =
10963 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10964 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10965
10966 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10967 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10968 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10969 SDValue Ops[] = {
10970 Op.getOperand(0), // Chain
10971 Rsrc, // rsrc
10972 DAG.getConstant(0, DL, MVT::i32), // vindex
10973 VOffset, // voffset
10974 SOffset, // soffset
10975 Offset, // offset
10976 Op.getOperand(5), // cachepolicy, swizzled buffer
10977 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10978 };
10979
10980 auto *M = cast<MemSDNode>(Op);
10981 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10982 }
10983 case Intrinsic::amdgcn_struct_buffer_load:
10984 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10985 case Intrinsic::amdgcn_struct_buffer_load_format:
10986 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10987 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10988 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10989 const bool IsFormat =
10990 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10991 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10992
10993 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10994 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10995 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10996 SDValue Ops[] = {
10997 Op.getOperand(0), // Chain
10998 Rsrc, // rsrc
10999 Op.getOperand(3), // vindex
11000 VOffset, // voffset
11001 SOffset, // soffset
11002 Offset, // offset
11003 Op.getOperand(6), // cachepolicy, swizzled buffer
11004 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11005 };
11006
11007 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
11008 }
11009 case Intrinsic::amdgcn_raw_tbuffer_load:
11010 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11011 MemSDNode *M = cast<MemSDNode>(Op);
11012 EVT LoadVT = Op.getValueType();
11013 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11014 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11015 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11016
11017 SDValue Ops[] = {
11018 Op.getOperand(0), // Chain
11019 Rsrc, // rsrc
11020 DAG.getConstant(0, DL, MVT::i32), // vindex
11021 VOffset, // voffset
11022 SOffset, // soffset
11023 Offset, // offset
11024 Op.getOperand(5), // format
11025 Op.getOperand(6), // cachepolicy, swizzled buffer
11026 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11027 };
11028
11029 if (LoadVT.getScalarType() == MVT::f16)
11030 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11031 Ops);
11032 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11033 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11034 DAG);
11035 }
11036 case Intrinsic::amdgcn_struct_tbuffer_load:
11037 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11038 MemSDNode *M = cast<MemSDNode>(Op);
11039 EVT LoadVT = Op.getValueType();
11040 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11041 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11042 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11043
11044 SDValue Ops[] = {
11045 Op.getOperand(0), // Chain
11046 Rsrc, // rsrc
11047 Op.getOperand(3), // vindex
11048 VOffset, // voffset
11049 SOffset, // soffset
11050 Offset, // offset
11051 Op.getOperand(6), // format
11052 Op.getOperand(7), // cachepolicy, swizzled buffer
11053 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11054 };
11055
11056 if (LoadVT.getScalarType() == MVT::f16)
11057 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11058 Ops);
11059 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11060 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11061 DAG);
11062 }
11063 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11064 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11065 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11066 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11067 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11068 return lowerStructBufferAtomicIntrin(Op, DAG,
11069 AMDGPUISD::BUFFER_ATOMIC_FADD);
11070 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11072 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11073 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11074 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11075 return lowerStructBufferAtomicIntrin(Op, DAG,
11076 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11077 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11078 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11079 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11080 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11082 return lowerStructBufferAtomicIntrin(Op, DAG,
11083 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11084 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11085 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11086 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11087 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11088 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11089 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11090 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11092 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11093 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11094 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11095 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11096 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11097 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11098 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11099 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11100 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11101 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11102 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11103 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11104 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11105 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11107 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11108 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11110 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11111 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11112 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11113 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11114 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11115 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11116 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11117 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11118 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11119 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11120 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11121 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11122 return lowerStructBufferAtomicIntrin(Op, DAG,
11123 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11124 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11125 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11126 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11127 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11129 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11130 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11131 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11132 return lowerStructBufferAtomicIntrin(Op, DAG,
11133 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11134 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11135 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11136 return lowerStructBufferAtomicIntrin(Op, DAG,
11137 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11138 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11140 return lowerStructBufferAtomicIntrin(Op, DAG,
11141 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11142 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11144 return lowerStructBufferAtomicIntrin(Op, DAG,
11145 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11146 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11147 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11148 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11149 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11150 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11151 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11152 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11153 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11154 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11155 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11156 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11157 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11158 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11159 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11160 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11161 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11162 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11163 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11164 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11165 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11166 return lowerStructBufferAtomicIntrin(Op, DAG,
11167 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11168 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11170 return lowerRawBufferAtomicIntrin(Op, DAG,
11171 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11172 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11173 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11174 return lowerStructBufferAtomicIntrin(Op, DAG,
11175 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11176 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11177 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11178 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
11179 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11180 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11181 SDValue Ops[] = {
11182 Op.getOperand(0), // Chain
11183 Op.getOperand(2), // src
11184 Op.getOperand(3), // cmp
11185 Rsrc, // rsrc
11186 DAG.getConstant(0, DL, MVT::i32), // vindex
11187 VOffset, // voffset
11188 SOffset, // soffset
11189 Offset, // offset
11190 Op.getOperand(7), // cachepolicy
11191 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11192 };
11193 EVT VT = Op.getValueType();
11194 auto *M = cast<MemSDNode>(Op);
11195
11196 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11197 Op->getVTList(), Ops, VT,
11198 M->getMemOperand());
11199 }
11200 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11201 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11202 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11204 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11205 SDValue Ops[] = {
11206 Op.getOperand(0), // Chain
11207 Op.getOperand(2), // src
11208 Op.getOperand(3), // cmp
11209 Rsrc, // rsrc
11210 Op.getOperand(5), // vindex
11211 VOffset, // voffset
11212 SOffset, // soffset
11213 Offset, // offset
11214 Op.getOperand(8), // cachepolicy
11215 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11216 };
11217 EVT VT = Op.getValueType();
11218 auto *M = cast<MemSDNode>(Op);
11219
11220 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11221 Op->getVTList(), Ops, VT,
11222 M->getMemOperand());
11223 }
11224 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11225 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11226 MemSDNode *M = cast<MemSDNode>(Op);
11227 SDValue NodePtr = M->getOperand(2);
11228 SDValue RayExtent = M->getOperand(3);
11229 SDValue InstanceMask = M->getOperand(4);
11230 SDValue RayOrigin = M->getOperand(5);
11231 SDValue RayDir = M->getOperand(6);
11232 SDValue Offsets = M->getOperand(7);
11233 SDValue TDescr = M->getOperand(8);
11234
11235 assert(NodePtr.getValueType() == MVT::i64);
11236 assert(RayDir.getValueType() == MVT::v3f32);
11237
11238 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11239 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11240 return SDValue();
11241 }
11242
11243 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11244 const unsigned NumVDataDwords = 10;
11245 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11246 int Opcode = AMDGPU::getMIMGOpcode(
11247 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11248 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11249 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11250 assert(Opcode != -1);
11251
11253 Ops.push_back(NodePtr);
11254 Ops.push_back(DAG.getBuildVector(
11255 MVT::v2i32, DL,
11256 {DAG.getBitcast(MVT::i32, RayExtent),
11257 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11258 Ops.push_back(RayOrigin);
11259 Ops.push_back(RayDir);
11260 Ops.push_back(Offsets);
11261 Ops.push_back(TDescr);
11262 Ops.push_back(M->getChain());
11263
11264 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11265 MachineMemOperand *MemRef = M->getMemOperand();
11266 DAG.setNodeMemRefs(NewNode, {MemRef});
11267 return SDValue(NewNode, 0);
11268 }
11269 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11270 MemSDNode *M = cast<MemSDNode>(Op);
11271 SDValue NodePtr = M->getOperand(2);
11272 SDValue RayExtent = M->getOperand(3);
11273 SDValue RayOrigin = M->getOperand(4);
11274 SDValue RayDir = M->getOperand(5);
11275 SDValue RayInvDir = M->getOperand(6);
11276 SDValue TDescr = M->getOperand(7);
11277
11278 assert(NodePtr.getValueType() == MVT::i32 ||
11279 NodePtr.getValueType() == MVT::i64);
11280 assert(RayDir.getValueType() == MVT::v3f16 ||
11281 RayDir.getValueType() == MVT::v3f32);
11282
11283 if (!Subtarget->hasGFX10_AEncoding()) {
11284 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11285 return SDValue();
11286 }
11287
11288 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11289 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11290 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11291 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11292 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11293 const unsigned NumVDataDwords = 4;
11294 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11295 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11296 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11297 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11298 IsGFX12Plus;
11299 const unsigned BaseOpcodes[2][2] = {
11300 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11301 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11302 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11303 int Opcode;
11304 if (UseNSA) {
11305 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11306 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11307 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11308 : AMDGPU::MIMGEncGfx10NSA,
11309 NumVDataDwords, NumVAddrDwords);
11310 } else {
11311 assert(!IsGFX12Plus);
11312 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11313 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11314 : AMDGPU::MIMGEncGfx10Default,
11315 NumVDataDwords, NumVAddrDwords);
11316 }
11317 assert(Opcode != -1);
11318
11320
11321 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11323 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11324 if (Lanes[0].getValueSizeInBits() == 32) {
11325 for (unsigned I = 0; I < 3; ++I)
11326 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11327 } else {
11328 if (IsAligned) {
11329 Ops.push_back(DAG.getBitcast(
11330 MVT::i32,
11331 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11332 Ops.push_back(Lanes[2]);
11333 } else {
11334 SDValue Elt0 = Ops.pop_back_val();
11335 Ops.push_back(DAG.getBitcast(
11336 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11337 Ops.push_back(DAG.getBitcast(
11338 MVT::i32,
11339 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11340 }
11341 }
11342 };
11343
11344 if (UseNSA && IsGFX11Plus) {
11345 Ops.push_back(NodePtr);
11346 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11347 Ops.push_back(RayOrigin);
11348 if (IsA16) {
11349 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11350 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11351 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11352 for (unsigned I = 0; I < 3; ++I) {
11353 MergedLanes.push_back(DAG.getBitcast(
11354 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11355 {DirLanes[I], InvDirLanes[I]})));
11356 }
11357 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11358 } else {
11359 Ops.push_back(RayDir);
11360 Ops.push_back(RayInvDir);
11361 }
11362 } else {
11363 if (Is64)
11364 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11365 2);
11366 else
11367 Ops.push_back(NodePtr);
11368
11369 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11370 packLanes(RayOrigin, true);
11371 packLanes(RayDir, true);
11372 packLanes(RayInvDir, false);
11373 }
11374
11375 if (!UseNSA) {
11376 // Build a single vector containing all the operands so far prepared.
11377 if (NumVAddrDwords > 12) {
11378 SDValue Undef = DAG.getPOISON(MVT::i32);
11379 Ops.append(16 - Ops.size(), Undef);
11380 }
11381 assert(Ops.size() >= 8 && Ops.size() <= 12);
11382 SDValue MergedOps =
11383 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11384 Ops.clear();
11385 Ops.push_back(MergedOps);
11386 }
11387
11388 Ops.push_back(TDescr);
11389 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11390 Ops.push_back(M->getChain());
11391
11392 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11393 MachineMemOperand *MemRef = M->getMemOperand();
11394 DAG.setNodeMemRefs(NewNode, {MemRef});
11395 return SDValue(NewNode, 0);
11396 }
11397 case Intrinsic::amdgcn_global_atomic_fmin_num:
11398 case Intrinsic::amdgcn_global_atomic_fmax_num:
11399 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11400 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11401 MemSDNode *M = cast<MemSDNode>(Op);
11402 SDValue Ops[] = {
11403 M->getOperand(0), // Chain
11404 M->getOperand(2), // Ptr
11405 M->getOperand(3) // Value
11406 };
11407 unsigned Opcode = 0;
11408 switch (IntrID) {
11409 case Intrinsic::amdgcn_global_atomic_fmin_num:
11410 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11411 Opcode = ISD::ATOMIC_LOAD_FMIN;
11412 break;
11413 }
11414 case Intrinsic::amdgcn_global_atomic_fmax_num:
11415 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11416 Opcode = ISD::ATOMIC_LOAD_FMAX;
11417 break;
11418 }
11419 default:
11420 llvm_unreachable("unhandled atomic opcode");
11421 }
11422 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11423 Ops, M->getMemOperand());
11424 }
11425 case Intrinsic::amdgcn_s_alloc_vgpr: {
11426 SDValue NumVGPRs = Op.getOperand(2);
11427 if (!NumVGPRs->isDivergent())
11428 return Op;
11429
11430 SDValue ReadFirstLaneID =
11431 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11432 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11433 ReadFirstLaneID, NumVGPRs);
11434
11435 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11436 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11437 }
11438 case Intrinsic::amdgcn_s_get_barrier_state:
11439 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11440 SDValue Chain = Op->getOperand(0);
11442 unsigned Opc;
11443
11444 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11445 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11446 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11447 BarID = (BarID >> 4) & 0x3F;
11448 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11449 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11450 Ops.push_back(K);
11451 Ops.push_back(Chain);
11452 } else {
11453 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11454 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11455 SDValue M0Val;
11456 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11457 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11458 M0Val = SDValue(
11459 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11460 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11461 0);
11462 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11463 } else
11464 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11465 }
11466
11467 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11468 return SDValue(NewMI, 0);
11469 }
11470 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11471 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11472 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11473 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11474 SDValue Chain = Op->getOperand(0);
11475 SDValue Ptr = Op->getOperand(2);
11476 EVT VT = Op->getValueType(0);
11477 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11478 Chain, Ptr, MII->getMemOperand());
11479 }
11480 case Intrinsic::amdgcn_flat_load_monitor_b32:
11481 case Intrinsic::amdgcn_flat_load_monitor_b64:
11482 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11483 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11484 SDValue Chain = Op->getOperand(0);
11485 SDValue Ptr = Op->getOperand(2);
11486 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
11487 Op->getVTList(), {Chain, Ptr},
11488 MII->getMemoryVT(), MII->getMemOperand());
11489 }
11490 case Intrinsic::amdgcn_global_load_monitor_b32:
11491 case Intrinsic::amdgcn_global_load_monitor_b64:
11492 case Intrinsic::amdgcn_global_load_monitor_b128: {
11493 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11494 SDValue Chain = Op->getOperand(0);
11495 SDValue Ptr = Op->getOperand(2);
11496 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
11497 Op->getVTList(), {Chain, Ptr},
11498 MII->getMemoryVT(), MII->getMemOperand());
11499 }
11500 default:
11501
11502 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11504 return lowerImage(Op, ImageDimIntr, DAG, true);
11505
11506 return SDValue();
11507 }
11508}
11509
11510// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11511// dwordx4 if on SI and handle TFE loads.
11512SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11513 SDVTList VTList,
11514 ArrayRef<SDValue> Ops, EVT MemVT,
11515 MachineMemOperand *MMO,
11516 SelectionDAG &DAG) const {
11517 LLVMContext &C = *DAG.getContext();
11518 MachineFunction &MF = DAG.getMachineFunction();
11519 EVT VT = VTList.VTs[0];
11520
11521 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11522 bool IsTFE = VTList.NumVTs == 3;
11523 if (IsTFE) {
11524 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11525 unsigned NumOpDWords = NumValueDWords + 1;
11526 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11527 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11528 MachineMemOperand *OpDWordsMMO =
11529 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11530 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11531 OpDWordsVT, OpDWordsMMO, DAG);
11532 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11533 DAG.getVectorIdxConstant(NumValueDWords, DL));
11534 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11535 SDValue ValueDWords =
11536 NumValueDWords == 1
11537 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11539 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11540 ZeroIdx);
11541 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11542 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11543 }
11544
11545 if (!Subtarget->hasDwordx3LoadStores() &&
11546 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11547 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11548 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11549 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11550 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11551 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11552 WidenedMemVT, WidenedMMO);
11554 DAG.getVectorIdxConstant(0, DL));
11555 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11556 }
11557
11558 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11559}
11560
11561SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11562 bool ImageStore) const {
11563 EVT StoreVT = VData.getValueType();
11564
11565 // No change for f16 and legal vector D16 types.
11566 if (!StoreVT.isVector())
11567 return VData;
11568
11569 SDLoc DL(VData);
11570 unsigned NumElements = StoreVT.getVectorNumElements();
11571
11572 if (Subtarget->hasUnpackedD16VMem()) {
11573 // We need to unpack the packed data to store.
11574 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11575 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11576
11577 EVT EquivStoreVT =
11578 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11579 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11580 return DAG.UnrollVectorOp(ZExt.getNode());
11581 }
11582
11583 // The sq block of gfx8.1 does not estimate register use correctly for d16
11584 // image store instructions. The data operand is computed as if it were not a
11585 // d16 image instruction.
11586 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11587 // Bitcast to i16
11588 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11589 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11590
11591 // Decompose into scalars
11593 DAG.ExtractVectorElements(IntVData, Elts);
11594
11595 // Group pairs of i16 into v2i16 and bitcast to i32
11596 SmallVector<SDValue, 4> PackedElts;
11597 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11598 SDValue Pair =
11599 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11600 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11601 PackedElts.push_back(IntPair);
11602 }
11603 if ((NumElements % 2) == 1) {
11604 // Handle v3i16
11605 unsigned I = Elts.size() / 2;
11606 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11607 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11608 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11609 PackedElts.push_back(IntPair);
11610 }
11611
11612 // Pad using UNDEF
11613 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11614
11615 // Build final vector
11616 EVT VecVT =
11617 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11618 return DAG.getBuildVector(VecVT, DL, PackedElts);
11619 }
11620
11621 if (NumElements == 3) {
11622 EVT IntStoreVT =
11624 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11625
11626 EVT WidenedStoreVT = EVT::getVectorVT(
11627 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11628 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11629 WidenedStoreVT.getStoreSizeInBits());
11630 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11631 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11632 }
11633
11634 assert(isTypeLegal(StoreVT));
11635 return VData;
11636}
11637
11638static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11639 switch (Intr) {
11640 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11641 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11642 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11643 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11644 case Intrinsic::amdgcn_load_async_to_lds:
11645 case Intrinsic::amdgcn_global_load_async_lds:
11646 return true;
11647 }
11648 return false;
11649}
11650
11651SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11652 SelectionDAG &DAG) const {
11653 SDLoc DL(Op);
11654 SDValue Chain = Op.getOperand(0);
11655 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11656
11657 switch (IntrinsicID) {
11658 case Intrinsic::amdgcn_exp_compr: {
11659 if (!Subtarget->hasCompressedExport()) {
11660 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11662 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11663 }
11664 SDValue Src0 = Op.getOperand(4);
11665 SDValue Src1 = Op.getOperand(5);
11666 // Hack around illegal type on SI by directly selecting it.
11667 if (isTypeLegal(Src0.getValueType()))
11668 return SDValue();
11669
11670 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11671 SDValue Undef = DAG.getPOISON(MVT::f32);
11672 const SDValue Ops[] = {
11673 Op.getOperand(2), // tgt
11674 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11675 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11676 Undef, // src2
11677 Undef, // src3
11678 Op.getOperand(7), // vm
11679 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11680 Op.getOperand(3), // en
11681 Op.getOperand(0) // Chain
11682 };
11683
11684 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11685 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11686 }
11687
11688 case Intrinsic::amdgcn_struct_tbuffer_store:
11689 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11690 SDValue VData = Op.getOperand(2);
11691 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11692 if (IsD16)
11693 VData = handleD16VData(VData, DAG);
11694 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11695 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11696 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11697 SDValue Ops[] = {
11698 Chain,
11699 VData, // vdata
11700 Rsrc, // rsrc
11701 Op.getOperand(4), // vindex
11702 VOffset, // voffset
11703 SOffset, // soffset
11704 Offset, // offset
11705 Op.getOperand(7), // format
11706 Op.getOperand(8), // cachepolicy, swizzled buffer
11707 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11708 };
11709 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11710 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11711 MemSDNode *M = cast<MemSDNode>(Op);
11712 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11713 M->getMemoryVT(), M->getMemOperand());
11714 }
11715
11716 case Intrinsic::amdgcn_raw_tbuffer_store:
11717 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11718 SDValue VData = Op.getOperand(2);
11719 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11720 if (IsD16)
11721 VData = handleD16VData(VData, DAG);
11722 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11723 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11724 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11725 SDValue Ops[] = {
11726 Chain,
11727 VData, // vdata
11728 Rsrc, // rsrc
11729 DAG.getConstant(0, DL, MVT::i32), // vindex
11730 VOffset, // voffset
11731 SOffset, // soffset
11732 Offset, // offset
11733 Op.getOperand(6), // format
11734 Op.getOperand(7), // cachepolicy, swizzled buffer
11735 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11736 };
11737 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11738 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11739 MemSDNode *M = cast<MemSDNode>(Op);
11740 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11741 M->getMemoryVT(), M->getMemOperand());
11742 }
11743
11744 case Intrinsic::amdgcn_raw_buffer_store:
11745 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11746 case Intrinsic::amdgcn_raw_buffer_store_format:
11747 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11748 const bool IsFormat =
11749 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11750 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11751
11752 SDValue VData = Op.getOperand(2);
11753 EVT VDataVT = VData.getValueType();
11754 EVT EltType = VDataVT.getScalarType();
11755 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11756 if (IsD16) {
11757 VData = handleD16VData(VData, DAG);
11758 VDataVT = VData.getValueType();
11759 }
11760
11761 if (!isTypeLegal(VDataVT)) {
11762 VData =
11763 DAG.getNode(ISD::BITCAST, DL,
11764 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11765 }
11766
11767 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11768 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11769 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11770 SDValue Ops[] = {
11771 Chain,
11772 VData,
11773 Rsrc,
11774 DAG.getConstant(0, DL, MVT::i32), // vindex
11775 VOffset, // voffset
11776 SOffset, // soffset
11777 Offset, // offset
11778 Op.getOperand(6), // cachepolicy, swizzled buffer
11779 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11780 };
11781 unsigned Opc =
11782 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11783 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11784 MemSDNode *M = cast<MemSDNode>(Op);
11785
11786 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11787 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11788 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11789
11790 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11791 M->getMemoryVT(), M->getMemOperand());
11792 }
11793
11794 case Intrinsic::amdgcn_struct_buffer_store:
11795 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11796 case Intrinsic::amdgcn_struct_buffer_store_format:
11797 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11798 const bool IsFormat =
11799 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11800 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11801
11802 SDValue VData = Op.getOperand(2);
11803 EVT VDataVT = VData.getValueType();
11804 EVT EltType = VDataVT.getScalarType();
11805 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11806
11807 if (IsD16) {
11808 VData = handleD16VData(VData, DAG);
11809 VDataVT = VData.getValueType();
11810 }
11811
11812 if (!isTypeLegal(VDataVT)) {
11813 VData =
11814 DAG.getNode(ISD::BITCAST, DL,
11815 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11816 }
11817
11818 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11819 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11820 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11821 SDValue Ops[] = {
11822 Chain,
11823 VData,
11824 Rsrc,
11825 Op.getOperand(4), // vindex
11826 VOffset, // voffset
11827 SOffset, // soffset
11828 Offset, // offset
11829 Op.getOperand(7), // cachepolicy, swizzled buffer
11830 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11831 };
11832 unsigned Opc =
11833 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11834 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11835 MemSDNode *M = cast<MemSDNode>(Op);
11836
11837 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11838 EVT VDataType = VData.getValueType().getScalarType();
11839 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11840 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11841
11842 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11843 M->getMemoryVT(), M->getMemOperand());
11844 }
11845 case Intrinsic::amdgcn_raw_buffer_load_lds:
11846 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11847 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11848 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11849 case Intrinsic::amdgcn_struct_buffer_load_lds:
11850 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11851 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11852 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11853 if (!Subtarget->hasVMemToLDSLoad())
11854 return SDValue();
11855 unsigned Opc;
11856 bool HasVIndex =
11857 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11858 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11859 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11860 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11861 unsigned OpOffset = HasVIndex ? 1 : 0;
11862 SDValue VOffset = Op.getOperand(5 + OpOffset);
11863 bool HasVOffset = !isNullConstant(VOffset);
11864 unsigned Size = Op->getConstantOperandVal(4);
11865
11866 switch (Size) {
11867 default:
11868 return SDValue();
11869 case 1:
11870 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11871 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11872 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11873 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11874 break;
11875 case 2:
11876 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11877 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11878 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11879 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11880 break;
11881 case 4:
11882 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11883 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11884 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11885 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11886 break;
11887 case 12:
11888 if (!Subtarget->hasLDSLoadB96_B128())
11889 return SDValue();
11890 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11891 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11892 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11893 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11894 break;
11895 case 16:
11896 if (!Subtarget->hasLDSLoadB96_B128())
11897 return SDValue();
11898 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11899 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11900 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11901 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11902 break;
11903 }
11904
11905 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11906
11908
11909 if (HasVIndex && HasVOffset)
11910 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11911 {Op.getOperand(5), // VIndex
11912 VOffset}));
11913 else if (HasVIndex)
11914 Ops.push_back(Op.getOperand(5));
11915 else if (HasVOffset)
11916 Ops.push_back(VOffset);
11917
11918 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11919 Ops.push_back(Rsrc);
11920 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11921 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11922 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11923 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11924 Ops.push_back(DAG.getTargetConstant(
11925 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11926 DL, MVT::i8)); // cpol
11927 Ops.push_back(DAG.getTargetConstant(
11928 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11929 ? 1
11930 : 0,
11931 DL, MVT::i8)); // swz
11932 Ops.push_back(
11933 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
11934 Ops.push_back(M0Val.getValue(0)); // Chain
11935 Ops.push_back(M0Val.getValue(1)); // Glue
11936
11937 auto *M = cast<MemSDNode>(Op);
11938 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11939 DAG.setNodeMemRefs(Load, M->memoperands());
11940
11941 return SDValue(Load, 0);
11942 }
11943 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11944 // for "trust me" that the remaining cases are global pointers until
11945 // such time as we can put two mem operands on an intrinsic.
11946 case Intrinsic::amdgcn_load_to_lds:
11947 case Intrinsic::amdgcn_load_async_to_lds:
11948 case Intrinsic::amdgcn_global_load_lds:
11949 case Intrinsic::amdgcn_global_load_async_lds: {
11950 if (!Subtarget->hasVMemToLDSLoad())
11951 return SDValue();
11952
11953 unsigned Opc;
11954 unsigned Size = Op->getConstantOperandVal(4);
11955 switch (Size) {
11956 default:
11957 return SDValue();
11958 case 1:
11959 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11960 break;
11961 case 2:
11962 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11963 break;
11964 case 4:
11965 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11966 break;
11967 case 12:
11968 if (!Subtarget->hasLDSLoadB96_B128())
11969 return SDValue();
11970 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11971 break;
11972 case 16:
11973 if (!Subtarget->hasLDSLoadB96_B128())
11974 return SDValue();
11975 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11976 break;
11977 }
11978
11979 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11980
11982
11983 SDValue Addr = Op.getOperand(2); // Global ptr
11984 SDValue VOffset;
11985 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11986 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11987 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11988 SDValue LHS = Addr.getOperand(0);
11989 SDValue RHS = Addr.getOperand(1);
11990
11991 if (LHS->isDivergent())
11992 std::swap(LHS, RHS);
11993
11994 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11995 RHS.getOperand(0).getValueType() == MVT::i32) {
11996 // add (i64 sgpr), (zero_extend (i32 vgpr))
11997 Addr = LHS;
11998 VOffset = RHS.getOperand(0);
11999 }
12000 }
12001
12002 Ops.push_back(Addr);
12003 if (!Addr->isDivergent()) {
12005 if (!VOffset)
12006 VOffset =
12007 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
12008 DAG.getTargetConstant(0, DL, MVT::i32)),
12009 0);
12010 Ops.push_back(VOffset);
12011 }
12012
12013 Ops.push_back(Op.getOperand(5)); // Offset
12014
12015 unsigned Aux = Op.getConstantOperandVal(6);
12016 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
12017 MVT::i32)); // CPol
12018 Ops.push_back(
12019 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12020
12021 Ops.push_back(M0Val.getValue(0)); // Chain
12022 Ops.push_back(M0Val.getValue(1)); // Glue
12023
12024 auto *M = cast<MemSDNode>(Op);
12025 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12026 DAG.setNodeMemRefs(Load, M->memoperands());
12027
12028 return SDValue(Load, 0);
12029 }
12030 case Intrinsic::amdgcn_end_cf:
12031 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
12032 Op->getOperand(2), Chain),
12033 0);
12034 case Intrinsic::amdgcn_s_barrier_init:
12035 case Intrinsic::amdgcn_s_barrier_signal_var: {
12036 // these two intrinsics have two operands: barrier pointer and member count
12037 SDValue Chain = Op->getOperand(0);
12039 SDValue BarOp = Op->getOperand(2);
12040 SDValue CntOp = Op->getOperand(3);
12041 SDValue M0Val;
12042 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12043 ? AMDGPU::S_BARRIER_INIT_M0
12044 : AMDGPU::S_BARRIER_SIGNAL_M0;
12045 // extract the BarrierID from bits 4-9 of BarOp
12046 SDValue BarID;
12047 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12048 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12049 BarID =
12050 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
12051 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12052 0);
12053 // Member count should be put into M0[ShAmt:+6]
12054 // Barrier ID should be put into M0[5:0]
12055 M0Val =
12056 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
12057 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12058 0);
12059 constexpr unsigned ShAmt = 16;
12060 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
12061 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
12062
12063 M0Val = SDValue(
12064 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
12065
12066 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12067
12068 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12069 return SDValue(NewMI, 0);
12070 }
12071 case Intrinsic::amdgcn_s_wakeup_barrier: {
12072 if (!Subtarget->hasSWakeupBarrier())
12073 return SDValue();
12074 [[fallthrough]];
12075 }
12076 case Intrinsic::amdgcn_s_barrier_join: {
12077 // these three intrinsics have one operand: barrier pointer
12078 SDValue Chain = Op->getOperand(0);
12080 SDValue BarOp = Op->getOperand(2);
12081 unsigned Opc;
12082
12083 if (isa<ConstantSDNode>(BarOp)) {
12084 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
12085 switch (IntrinsicID) {
12086 default:
12087 return SDValue();
12088 case Intrinsic::amdgcn_s_barrier_join:
12089 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12090 break;
12091 case Intrinsic::amdgcn_s_wakeup_barrier:
12092 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12093 break;
12094 }
12095 // extract the BarrierID from bits 4-9 of the immediate
12096 unsigned BarID = (BarVal >> 4) & 0x3F;
12097 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
12098 Ops.push_back(K);
12099 Ops.push_back(Chain);
12100 } else {
12101 switch (IntrinsicID) {
12102 default:
12103 return SDValue();
12104 case Intrinsic::amdgcn_s_barrier_join:
12105 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12106 break;
12107 case Intrinsic::amdgcn_s_wakeup_barrier:
12108 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12109 break;
12110 }
12111 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
12112 SDValue M0Val;
12113 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12114 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12115 M0Val =
12116 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
12117 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12118 0);
12119 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12120 }
12121
12122 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12123 return SDValue(NewMI, 0);
12124 }
12125 case Intrinsic::amdgcn_s_prefetch_data: {
12126 // For non-global address space preserve the chain and remove the call.
12128 return Op.getOperand(0);
12129 return Op;
12130 }
12131 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12132 SDValue Ops[] = {
12133 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
12134 Op.getOperand(3), // offset
12135 Op.getOperand(4), // length
12136 };
12137
12138 MemSDNode *M = cast<MemSDNode>(Op);
12139 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
12140 Op->getVTList(), Ops, M->getMemoryVT(),
12141 M->getMemOperand());
12142 }
12143 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12144 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12145 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12146 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12147 SDValue Chain = Op->getOperand(0);
12148 SDValue Ptr = Op->getOperand(2);
12149 SDValue Val = Op->getOperand(3);
12150 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
12151 Ptr, MII->getMemOperand());
12152 }
12153 default: {
12154 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12156 return lowerImage(Op, ImageDimIntr, DAG, true);
12157
12158 return Op;
12159 }
12160 }
12161}
12162
12163// Return whether the operation has NoUnsignedWrap property.
12164static bool isNoUnsignedWrap(SDValue Addr) {
12165 return (Addr.getOpcode() == ISD::ADD &&
12166 Addr->getFlags().hasNoUnsignedWrap()) ||
12167 Addr->getOpcode() == ISD::OR;
12168}
12169
12171 EVT PtrVT) const {
12172 return PtrVT == MVT::i64;
12173}
12174
12176 EVT PtrVT) const {
12177 return true;
12178}
12179
12180// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
12181// offset (the offset that is included in bounds checking and swizzling, to be
12182// split between the instruction's voffset and immoffset fields) and soffset
12183// (the offset that is excluded from bounds checking and swizzling, to go in
12184// the instruction's soffset field). This function takes the first kind of
12185// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  // Largest value the instruction's immoffset field can hold.
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  // Fully-constant offset: no register voffset component yet; the constant is
  // split between immoffset and (if it overflows) voffset below.
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    if (!CheckNUW || isNoUnsignedWrap(N0)) {
      // Peel the constant part off: base stays in N0, constant goes to C1.
      C1 = cast<ConstantSDNode>(N0.getOperand(1));
      N0 = N0.getOperand(0);
    }
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Negative overall offset: keep the whole value in the voffset add and
      // leave immoffset at zero.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      // Fold the spilled-over part back into the register voffset.
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = {N0, OverflowVal};
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  // Both components must be materialized even when zero.
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  // Result: {voffset value, immoffset as a target constant}.
  return {N0, SDValue(C1, 0)};
}
12239
12240// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12241// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12242// pointed to by Offsets.
12243void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12244 SelectionDAG &DAG, SDValue *Offsets,
12245 Align Alignment) const {
12246 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12247 SDLoc DL(CombinedOffset);
12248 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
12249 uint32_t Imm = C->getZExtValue();
12250 uint32_t SOffset, ImmOffset;
12251 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12252 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
12253 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12254 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12255 return;
12256 }
12257 }
12258 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
12259 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12260 // being added, so we can only safely match a 32-bit addition with no
12261 // unsigned overflow.
12262 bool CheckNUW = Subtarget->hasGFX1250Insts();
12263 SDValue N0 = CombinedOffset.getOperand(0);
12264 SDValue N1 = CombinedOffset.getOperand(1);
12265 uint32_t SOffset, ImmOffset;
12266 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
12267 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
12268 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12269 Offsets[0] = N0;
12270 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12271 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12272 return;
12273 }
12274 }
12275
12276 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12277 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
12278 : DAG.getConstant(0, DL, MVT::i32);
12279
12280 Offsets[0] = CombinedOffset;
12281 Offsets[1] = SOffsetZero;
12282 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
12283}
12284
12285SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12286 SelectionDAG &DAG) const {
12287 if (!MaybePointer.getValueType().isScalarInteger())
12288 return MaybePointer;
12289
12290 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12291 return Rsrc;
12292}
12293
12294// Wrap a global or flat pointer into a buffer intrinsic using the flags
12295// specified in the intrinsic.
12296SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12297 SelectionDAG &DAG) const {
12298 SDLoc Loc(Op);
12299
12300 SDValue Pointer = Op->getOperand(1);
12301 SDValue Stride = Op->getOperand(2);
12302 SDValue NumRecords = Op->getOperand(3);
12303 SDValue Flags = Op->getOperand(4);
12304
12305 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12306 SDValue Rsrc;
12307
12308 if (Subtarget->has45BitNumRecordsBufferResource()) {
12309 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12310 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12311 // num_records.
12312 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12313 SDValue NumRecordsLHS =
12314 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12315 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12316 SDValue LowHalf =
12317 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12318
12319 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12320 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12321 SDValue NumRecordsRHS =
12322 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12323 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12324 SDValue ShiftedStride =
12325 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12326 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12327 SDValue ExtShiftedStrideVec =
12328 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12329 SDValue ExtShiftedStride =
12330 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12331 SDValue ShiftedFlags =
12332 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12333 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12334 SDValue ExtShiftedFlagsVec =
12335 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12336 SDValue ExtShiftedFlags =
12337 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12338 SDValue CombinedFields =
12339 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12340 SDValue HighHalf =
12341 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12342
12343 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12344 } else {
12345 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12346 auto [LowHalf, HighHalf] =
12347 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12348 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12349 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12350 SDValue ShiftedStride =
12351 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12352 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12353 SDValue NewHighHalf =
12354 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12355
12356 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12357 NumRecords, Flags);
12358 }
12359
12360 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12361 return RsrcPtr;
12362}
12363
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  // Widen the sub-dword load to a 32-bit BUFFER_LOAD_{UBYTE,USHORT} node, then
  // truncate and bitcast the result back to the requested type.
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    // TFE variants produce a second status dword alongside the data; model the
    // node as v2i32 and return {value, status, chain} merge values.
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    // 8 bytes: one dword of data plus the status dword.
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
    // Element 1 is the status, element 0 the loaded data.
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  // Narrow the 32-bit result back to the requested (possibly FP) type.
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
12401
12402// Handle 8 bit and 16 bit buffer stores
12403SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12404 EVT VDataType, SDLoc DL,
12405 SDValue Ops[],
12406 MemSDNode *M) const {
12407 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12408 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12409
12410 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12411 Ops[1] = BufferStoreExt;
12412 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12413 : AMDGPUISD::BUFFER_STORE_SHORT;
12414 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12415 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12416 M->getMemOperand());
12417}
12418
                                 SDValue Op, const SDLoc &SL, EVT VT) {
  // If the requested type is narrower than the value, a plain truncate is
  // correct regardless of the load's extension kind.
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  // Otherwise widen according to the extension type of the original load.
  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
12437
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // SMEM requires a uniform (non-divergent) address and dword alignment.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  // Re-issue the load as a full 32-bit non-extending load; range metadata is
  // dropped because it described the narrower value.
  SDValue Ptr = Ld->getBasePtr();
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  // Integer type with the same bit count as the original memory type, used to
  // re-establish the narrow value inside the 32-bit result.
  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
         "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
}
12507
                                         const SIMachineFunctionInfo &Info) {
  // TODO: Should check if the address can definitely not access stack.
  // Entry functions: presumably flat access to scratch requires flat-scratch
  // to have been initialized via the user SGPRs — TODO confirm; without that
  // init the access is treated as not-private.
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  // Non-entry functions: conservatively assume the address may be private.
  return true;
}
12515
// Custom lowering for loads: widens sub-dword scalar loads to 32 bits and
// splits/widens vector loads according to address space and subtarget limits.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    // Vector of i1: each lane was loaded as one bit of the widened value;
    // shift it down and truncate per element.
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Work around a misaligned-LDS hardware bug by splitting wide underaligned
  // flat loads.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
    // Uniform, dword-aligned, small-enough loads can go to scalar memory;
    // leave legal shapes alone and widen/split the rest.
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: fast misaligned access keeps the load intact; otherwise split.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
                        Load->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
12658
12659SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12660 EVT VT = Op.getValueType();
12661 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12662 VT.getSizeInBits() == 512)
12663 return splitTernaryVectorOp(Op, DAG);
12664
12665 assert(VT.getSizeInBits() == 64);
12666
12667 SDLoc DL(Op);
12668 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12669
12670 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12671 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12672
12673 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12674 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12675
12676 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12677 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12678
12679 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12680
12681 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12682 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12683
12684 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12685
12686 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12687 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12688}
12689
12690// Catch division cases where we can use shortcuts with rcp and rsq
12691// instructions.
12692SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12693 SelectionDAG &DAG) const {
12694 SDLoc SL(Op);
12695 SDValue LHS = Op.getOperand(0);
12696 SDValue RHS = Op.getOperand(1);
12697 EVT VT = Op.getValueType();
12698 const SDNodeFlags Flags = Op->getFlags();
12699
12700 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12701
12702 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12703 // Without !fpmath accuracy information, we can't do more because we don't
12704 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12705 // f16 is always accurate enough
12706 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12707 return SDValue();
12708
12709 if (CLHS->isExactlyValue(1.0)) {
12710 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12711 // the CI documentation has a worst case error of 1 ulp.
12712 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12713 // use it as long as we aren't trying to use denormals.
12714 //
12715 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12716
12717 // 1.0 / sqrt(x) -> rsq(x)
12718
12719 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12720 // error seems really high at 2^29 ULP.
12721 // 1.0 / x -> rcp(x)
12722 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12723 }
12724
12725 // Same as for 1.0, but expand the sign out of the constant.
12726 if (CLHS->isExactlyValue(-1.0)) {
12727 // -1.0 / x -> rcp (fneg x)
12728 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12729 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12730 }
12731 }
12732
12733 // For f16 and bf16 require afn or arcp.
12734 // For f32 require afn.
12735 if (!AllowInaccurateRcp &&
12736 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12737 return SDValue();
12738
12739 // Turn into multiply by the reciprocal.
12740 // x / y -> x * (1.0 / y)
12741 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12742 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12743}
12744
12745SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12746 SelectionDAG &DAG) const {
12747 SDLoc SL(Op);
12748 SDValue X = Op.getOperand(0);
12749 SDValue Y = Op.getOperand(1);
12750 EVT VT = Op.getValueType();
12751 const SDNodeFlags Flags = Op->getFlags();
12752
12753 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12754 if (!AllowInaccurateDiv)
12755 return SDValue();
12756
12757 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12758 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12759
12760 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12761 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12762
12763 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12764 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12765 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12766 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12767 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12768 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12769}
12770
12771static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12772 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12773 SDNodeFlags Flags) {
12774 if (GlueChain->getNumValues() <= 1) {
12775 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12776 }
12777
12778 assert(GlueChain->getNumValues() == 3);
12779
12780 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12781 switch (Opcode) {
12782 default:
12783 llvm_unreachable("no chain equivalent for opcode");
12784 case ISD::FMUL:
12785 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12786 break;
12787 }
12788
12789 return DAG.getNode(Opcode, SL, VTList,
12790 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12791 Flags);
12792}
12793
12794static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12795 EVT VT, SDValue A, SDValue B, SDValue C,
12796 SDValue GlueChain, SDNodeFlags Flags) {
12797 if (GlueChain->getNumValues() <= 1) {
12798 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12799 }
12800
12801 assert(GlueChain->getNumValues() == 3);
12802
12803 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12804 switch (Opcode) {
12805 default:
12806 llvm_unreachable("no chain equivalent for opcode");
12807 case ISD::FMA:
12808 Opcode = AMDGPUISD::FMA_W_CHAIN;
12809 break;
12810 }
12811
12812 return DAG.getNode(Opcode, SL, VTList,
12813 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12814 Flags);
12815}
12816
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  // Prefer the cheap rcp-based lowering when fast-math flags permit it.
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // Promote both operands and carry out the computation in f32.
  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);

  if (VT == MVT::bf16) {
    // bf16: emit an f32 division and round the result back down.
    SDValue ExtDiv =
        DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
                       DAG.getTargetConstant(0, SL, MVT::i32));
  }

  assert(VT == MVT::f16);

  // Reference expansion the code below implements:
  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  // NOTE(review): the initializer of FMADOpCode is not visible in this chunk;
  // presumably it picks ISD::FMAD when legal and ISD::FMA otherwise -- confirm
  // against the upstream source.
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Rcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
  SDValue Quot =
      DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
  // Two rounds of error computation and correction: err = n - d*q.
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                            Op->getFlags());
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                    Op->getFlags());
  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
  // 0xff800000 keeps only the sign and exponent bits of the correction term
  // (mantissa is masked off).
  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
                        DAG.getConstant(0xff800000, SL, MVT::i32));
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
                             DAG.getTargetConstant(0, SL, MVT::i32));
  // Final DIV_FIXUP touch-up (see the pseudo-code above).
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
                     Op->getFlags());
}
12875
// Faster 2.5 ULP division that does not support denormals.
// If |denominator| exceeds 2^96, it is pre-scaled by 2^-32 before the rcp and
// the quotient is multiplied by the same factor afterwards to compensate.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  // Operands start at index 1 -- presumably this is reached via an intrinsic
  // node whose operand 0 is the intrinsic id; confirm against callers.
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  // TODO: The combiner should probably handle elimination of redundant fabs.
                 ? RHS
                 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  // Threshold: 2^96.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  // Scale factor applied when the denominator is too large: 2^-32.
  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  // r2 = |denominator| > 2^96
  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  // r3 = 2^-32 if scaling is needed, else 1.0.
  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  // Undo the pre-scaling (r3 is 1.0 when no scaling occurred).
  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
12912
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
// The single-precision control (SPDenormMode) occupies the low 2 bits of the
// immediate; the function's default double-precision denormal mode is packed
// into the next 2 bits so it is preserved across the mode change.
                                 const SIMachineFunctionInfo *Info,
                                 const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
12923
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  // Prefer the cheap rcp-based lowering when fast-math flags permit it.
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  // DIV_SCALE yields the scaled value plus an i1 result.
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
  SDValue NegDivScale0 =
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  // MODE-register bitfield at offset 4, width 2: the FP32 denormal controls.
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  // Dynamic mode means the denormal setting is unknown at compile time and
  // must be saved/restored around the sequence rather than set to a constant.
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      // Read and remember the current MODE bits so they can be restored.
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
    }

    // Turn denormal flushing off for the inner FMA sequence.
    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
                                 EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    // Attach the chain/glue of the mode switch to NegDivScale0 so the glued
    // FMA/FMUL helpers below thread it through the whole sequence.
    SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
                      SDValue(EnableDenorm, 1)};

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  // Iterative refinement; each step goes through the glue-aware helpers so
  // it stays ordered between the enable/disable mode switches.
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
                           Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // Restore the previous denormal mode after the refinement sequence.
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);

      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      DisableDenorm =
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                      Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
              .getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      // Either write back the saved dynamic mode or the static flush mode.
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    // Anchor the restore in the DAG root so it is not dead-code eliminated.
    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
13060
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  // Prefer the approximate lowering when fast-math flags permit it.
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  // DIV_SCALE yields the scaled value plus an i1 result.
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  // FMA-based refinement of the reciprocal estimate.
  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 =
      DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out if the scale to use for div_fmas.
    // Compare the high 32 bits of each operand with the corresponding
    // DIV_SCALE result and XOR the two equality bits to recover the flag.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas =
      DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
13129
13130SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
13131 EVT VT = Op.getValueType();
13132
13133 if (VT == MVT::f32)
13134 return LowerFDIV32(Op, DAG);
13135
13136 if (VT == MVT::f64)
13137 return LowerFDIV64(Op, DAG);
13138
13139 if (VT == MVT::f16 || VT == MVT::bf16)
13140 return LowerFDIV16(Op, DAG);
13141
13142 llvm_unreachable("Unexpected type for fdiv");
13143}
13144
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  EVT VT = Val.getValueType();
  EVT ResultExpVT = Op->getValueType(1);
  // The hardware exponent result is i16 for f16 sources, i32 otherwise.
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  // Split frexp into the two dedicated intrinsics.
  SDValue Mant = DAG.getNode(
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    // Patch up the results for non-finite inputs on buggy subtargets:
    // if |Val| is not strictly below infinity, report exponent 0 and pass
    // the input through unchanged as the mantissa.
    SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
    SDValue Inf =

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
    Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
    Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
  }

  // Convert the hardware exponent width to the type the node promises.
  SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
  return DAG.getMergeValues({Mant, CastExp}, dl);
}
13174
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // i1 stores: sign-extend the value to i32 and emit a 1-bit
    // truncating store.
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  // Everything else reaching here is a vector of 32-bit scalars.
  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Split misaligned multi-dword flat stores on subtargets with the LDS
  // misaligned-access bug in WGP mode.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)

  unsigned NumElements = VT.getVectorNumElements();
    // Global/flat stores wider than 4 dwords must be split.
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

        VT, *Store->getMemOperand()))
      return expandUnalignedStore(Store, DAG);

    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private accesses are limited by the subtarget's max element size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
        Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
13255
13256// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13257SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13258 SDLoc SL(Op);
13259 assert(!Subtarget->has16BitInsts());
13260 SDNodeFlags Flags = Op->getFlags();
13261 SDValue Ext =
13262 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13263
13264 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13265 SDValue Sqrt =
13266 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13267
13268 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13269 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13270}
13271
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  // Inputs below 2^-96 are scaled up by 2^32 before the sqrt and the result
  // is scaled back down by 2^-16 (sqrt halves the exponent).
  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    // Denormal-correct path: take the hardware sqrt, then check the
    // neighboring representable values (one ulp down and up) and pick
    // whichever gives the correctly-rounded residual.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    // Next value down: subtract 1 from the bit pattern.
    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    SDValue SqrtSNextDownInt =
        DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                    DAG.getAllOnesConstant(DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    // Residual for the next-down candidate: x - (s-1ulp)*s.
    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    // Next value up: add 1 to the bit pattern.
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    // Residual for the next-up candidate: x - (s+1ulp)*s.
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    // Flush-to-zero path: rsq estimate followed by FMA-based refinement.
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  // Undo the input scaling: sqrt(x * 2^32) * 2^-16 == sqrt(x).
  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  // Zero and +inf inputs must produce themselves, not the refined value.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
13361
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  SDNodeFlags Flags = Op->getFlags();

  SDLoc DL(Op);

  SDValue X = Op.getOperand(0);
  // Inputs below 2^-767 are scaled up (by 2^256) to keep the intermediate
  // computation out of the denormal range.
  SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);

  SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);

  SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);

  // Scale up input if it is too small.
  SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
  SDValue ScaleUp =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
  SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);

  // y0 = rsq(x)
  SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);

  // g0 = x * y0
  SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);

  // h0 = 0.5 * y0
  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
  SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);

  // r0 = 0.5 - h0 * g0
  SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
  SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);

  // h1 = h0 * r0 + h0
  SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);

  // g1 = g0 * r0 + g0
  SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);

  // d0 = x - g1 * g1
  SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
  SDValue SqrtD0 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);

  // g2 = d0 * h1 + g1
  SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);

  // d1 = x - g2 * g2
  SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
  SDValue SqrtD1 =
      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);

  // g3 = d1 * h1 + g2
  SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);

  // Undo the input scaling: since sqrt halves the exponent, compensate the
  // 2^256 scale-up with a 2^-128 scale-down.
  SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
  SDValue ScaleDown =
      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  // If x is +INF, +0, or -0, use its original value
  return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
                     Flags);
}
13443
13444SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13445 SDLoc DL(Op);
13446 EVT VT = Op.getValueType();
13447 SDValue Arg = Op.getOperand(0);
13448 SDValue TrigVal;
13449
13450 // Propagate fast-math flags so that the multiply we introduce can be folded
13451 // if Arg is already the result of a multiply by constant.
13452 auto Flags = Op->getFlags();
13453
13454 // AMDGPUISD nodes of vector type must be unrolled here since
13455 // they will not be expanded elsewhere.
13456 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13457 if (!V.getValueType().isVector())
13458 return V;
13459
13460 return DAG.UnrollVectorOp(cast<SDNode>(V));
13461 };
13462
13463 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13464
13465 if (Subtarget->hasTrigReducedRange()) {
13466 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13467 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13468 } else {
13469 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13470 }
13471
13472 switch (Op.getOpcode()) {
13473 case ISD::FCOS:
13474 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13475 break;
13476 case ISD::FSIN:
13477 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13478 break;
13479 default:
13480 llvm_unreachable("Wrong trig opcode");
13481 }
13482
13483 return UnrollIfVec(TrigVal);
13484}
13485
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  // Pack {new, old} into a 2-element vector operand as the target node
  // expects.
  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  // Memory intrinsic node preserves the original VT list and mem operand.
  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
13514
13515//===----------------------------------------------------------------------===//
13516// Custom DAG optimizations
13517//===----------------------------------------------------------------------===//
13518
13519SDValue
13520SITargetLowering::performUCharToFloatCombine(SDNode *N,
13521 DAGCombinerInfo &DCI) const {
13522 EVT VT = N->getValueType(0);
13523 EVT ScalarVT = VT.getScalarType();
13524 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13525 return SDValue();
13526
13527 SelectionDAG &DAG = DCI.DAG;
13528 SDLoc DL(N);
13529
13530 SDValue Src = N->getOperand(0);
13531 EVT SrcVT = Src.getValueType();
13532
13533 // TODO: We could try to match extracting the higher bytes, which would be
13534 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13535 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13536 // about in practice.
13537 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13538 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13539 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13540 DCI.AddToWorklist(Cvt.getNode());
13541
13542 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13543 if (ScalarVT != MVT::f32) {
13544 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13545 DAG.getTargetConstant(0, DL, MVT::i32));
13546 }
13547 return Cvt;
13548 }
13549 }
13550
13551 return SDValue();
13552}
13553
// Combine FCOPYSIGN: for f64 magnitudes, only the high 32 bits carry the sign
// bit, so operate on f32 halves; for f64 sign operands, narrow the sign source
// to the f32 half that holds the sign bit.
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    // View each f64 element as two f32 lanes: {lo, hi}.
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);

    // NOTE(review): the declaration of NewElts (presumably a
    // SmallVector<SDValue, 8>) is missing from this listing -- confirm.
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I, DL, MVT::i32));
      SDValue MagHi =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I + 1, DL, MVT::i32));

      // NOTE(review): the '?' arm of this conditional (an extract of sign
      // element I) is missing from this listing -- confirm against upstream.
      SDValue SignOpElt =
          MagVT.isVector()
              SignOp, DAG.getConstant(I, DL, MVT::i32))
              : SignOp;

      // Only the high f32 half needs a real copysign; the low half is copied.
      SDValue HiOp =
          DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);

      SDValue Vector =
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

      SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
      NewElts.push_back(NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
                    DAG.getConstant(2 * I + 1, DL, MVT::i32));
    F32Signs.push_back(SignAsF32);
  }

  // NOTE(review): the ':' arm of this conditional (a BUILD_VECTOR call head)
  // is missing from this listing -- confirm against upstream.
  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
            EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
            F32Signs);

  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
                     NewSign);
}
13649
13650// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13651// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13652// bits
13653
13654// This is a variant of
13655// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13656//
13657// The normal DAG combiner will do this, but only if the add has one use since
13658// that would increase the number of instructions.
13659//
13660// This prevents us from seeing a constant offset that can be folded into a
13661// memory instruction's addressing mode. If we know the resulting add offset of
13662// a pointer can be folded into an addressing offset, we can replace the pointer
13663// operand with the add of new constant offset. This eliminates one of the uses,
13664// and may allow the remaining use to also be simplified.
13665//
13666SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13667 EVT MemVT,
13668 DAGCombinerInfo &DCI) const {
13669 SDValue N0 = N->getOperand(0);
13670 SDValue N1 = N->getOperand(1);
13671
13672 // We only do this to handle cases where it's profitable when there are
13673 // multiple uses of the add, so defer to the standard combine.
13674 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13675 return SDValue();
13676
13677 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13678 if (!CN1)
13679 return SDValue();
13680
13681 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13682 if (!CAdd)
13683 return SDValue();
13684
13685 SelectionDAG &DAG = DCI.DAG;
13686
13687 if (N0->getOpcode() == ISD::OR &&
13688 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13689 return SDValue();
13690
13691 // If the resulting offset is too large, we can't fold it into the
13692 // addressing mode offset.
13693 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13694 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13695
13696 AddrMode AM;
13697 AM.HasBaseReg = true;
13698 AM.BaseOffs = Offset.getSExtValue();
13699 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13700 return SDValue();
13701
13702 SDLoc SL(N);
13703 EVT VT = N->getValueType(0);
13704
13705 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13706 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13707
13708 SDNodeFlags Flags;
13709 Flags.setNoUnsignedWrap(
13710 N->getFlags().hasNoUnsignedWrap() &&
13711 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13712
13713 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13714 // be sure that the new left operand is a proper base pointer.
13715 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13716}
13717
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  case ISD::STORE:
  // NOTE(review): one or more case labels appear dropped from this listing
  // between STORE and the return -- confirm against upstream.
    return 2; // stores: operand 0 is the chain, 1 the value, 2 the pointer
  default:
    return 1; // loads/intrinsics: the pointer follows the chain
  }
}
13731
13732SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13733 DAGCombinerInfo &DCI) const {
13734 SelectionDAG &DAG = DCI.DAG;
13735
13736 unsigned PtrIdx = getBasePtrIndex(N);
13737 SDValue Ptr = N->getOperand(PtrIdx);
13738
13739 // TODO: We could also do this for multiplies.
13740 if (Ptr.getOpcode() == ISD::SHL) {
13741 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13742 N->getMemoryVT(), DCI);
13743 if (NewPtr) {
13744 SmallVector<SDValue, 8> NewOps(N->ops());
13745
13746 NewOps[PtrIdx] = NewPtr;
13747 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13748 }
13749 }
13750
13751 return SDValue();
13752}
13753
13754static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13755 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13756 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13757 (Opc == ISD::XOR && Val == 0);
13758}
13759
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val); // low 32 bits of the constant
  uint32_t ValHi = Hi_32(Val); // high 32 bits of the constant
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Split when a half trivially simplifies, or when the constant cannot be an
  // inline operand and would have to be materialized anyway.
  // NOTE(review): part of this condition (presumably the ValHi reducibility
  // check) is missing from this listing -- confirm against upstream.
  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // We have 64-bit scalar and/or/xor, but do not have vector forms.
    if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
        !CRHS->user_begin()->isDivergent())
      return SDValue();

    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
13789
  // Only i1 values can be canonical booleans here.
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::SETCC:
  case ISD::IS_FPCLASS:
  case AMDGPUISD::FP_CLASS:
    // Comparisons and FP classification produce a canonical bool.
    return true;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    // Logical combinations of canonical bools stay canonical.
    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    // Result #1 of these nodes is the overflow bool.
    return V.getResNo() == 1;
    // NOTE(review): a case label (likely ISD::INTRINSIC_WO_CHAIN with an
    // opening brace) appears dropped from this listing above the next
    // statement -- confirm against upstream.
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
      return true;
    default:
      return false;
    }

    return false;
  }
  }
  return false;
}
13826
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
// NOTE(review): the function signature line (presumably
// 'static uint32_t getConstantPermuteMask(uint32_t C) {') is missing from
// this listing -- confirm against upstream.
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  // Reject constants where some byte is only partially set.
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
13845
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
// NOTE(review): the function signature line (presumably
// 'static uint32_t getPermuteMask(SDValue V) {') is missing from this
// listing -- confirm against upstream.
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  // The second operand must be a constant mask/shift amount.
  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    // Kept bytes select themselves (identity 0x03020100); cleared bytes
    // select zero (0xc).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    // Bytes forced to all-ones select 0xff; the rest pass through.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    // Only whole-byte shifts can be expressed as a byte permute.
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
13893
// Combine ISD::AND: split 64-bit AND-with-constant, form BFE/v_perm/fp_class
// nodes, and turn (and x, (sext i1)) into a select.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE =
              DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
                          DAG.getConstant(Offset, SL, MVT::i32),
                          DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
        !isTypeLegal(X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // NOTE(review): several continuation lines of this mask expression
        // and of the static_assert below are missing from this listing --
        // confirm against upstream.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |

        static_assert(
                0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
                           DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  // Canonicalize the fp_class operand to the RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
                           DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
14078
14079// A key component of v_perm is a mapping between byte position of the src
14080// operands, and the byte position of the dest. To provide such, we need: 1. the
14081// node that provides x byte of the dest of the OR, and 2. the byte of the node
14082// used to provide that x byte. calculateByteProvider finds which node provides
14083// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
14084// and finds an ultimate src and byte position For example: The supported
14085// LoadCombine pattern for vector loads is as follows
14086// t1
14087// or
14088// / \
14089// t2 t3
14090// zext shl
14091// | | \
14092// t4 t5 16
14093// or anyext
14094// / \ |
14095// t6 t7 t8
14096// srl shl or
14097// / | / \ / \
14098// t9 t10 t11 t12 t13 t14
14099// trunc* 8 trunc* 8 and and
14100// | | / | | \
14101// t15 t16 t17 t18 t19 t20
14102// trunc* 255 srl -256
14103// | / \
14104// t15 t15 16
14105//
14106// *In this example, the truncs are from i32->i16
14107//
14108// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
14109// respectively. calculateSrcByte would find (given node) -> ultimate src &
14110// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
14111// After finding the mapping, we can combine the tree into vperm t15, t16,
14112// 0x05000407
14113
// Find the source and byte position from a node.
// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// dest of the or byte. \p Depth tracks how many recursive iterations we have
// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs
  if (Depth >= 6)
    return std::nullopt;

  // Sub-byte values cannot provide a whole byte.
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE: {
    // Truncation keeps the low bytes in place; look through it.
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    // NOTE(review): a case label (likely ISD::SIGN_EXTEND_INREG with an
    // opening brace) appears dropped from this listing here -- confirm.
    SDValue NarrowOp = Op->getOperand(0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowVT = VTSign->getVT();
    }
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    // The requested byte must exist in the narrow source.
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SRA:
  case ISD::SRL: {
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    // Only whole-byte shifts preserve byte boundaries.
    if (BitShift % 8 != 0)
      return std::nullopt;

    SrcIndex += BitShift / 8;

    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  default: {
    // Any other node is treated as an ultimate source.
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
  }
  }
  llvm_unreachable("fully handled switch");
}
14178
14179// For a byte position in the result of an Or, traverse the tree and find the
14180// node (and the byte of the node) which ultimately provides this {Or,
14181// BytePosition}. \p Op is the operand we are currently examining. \p Index is
14182// the byte position of the Op that corresponds with the originally requested
14183// byte of the Or \p Depth tracks how many recursive iterations we have
14184// performed. \p StartingIndex is the originally requested byte of the Or
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // Finding Src tree of RHS of or typically requires at least 1 additional
  // depth
  if (Depth > 6)
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;

    auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
                                     StartingIndex);
    if (!RHS)
      return std::nullopt;
    auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
                                     StartingIndex);
    if (!LHS)
      return std::nullopt;
    // A well formed Or will have two ByteProviders for each byte, one of which
    // is constant zero
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // If the result of the and partially provides the byte, then it
      // is not well formatted
      if (IndexMask & BitMask)
        return std::nullopt;
      // NOTE(review): the statement for the fully-masked case (presumably
      // returning a constant-zero provider) is missing from this listing.
    }

    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
    // If the byte we are trying to provide (as tracked by index) falls in this
    // range, then the SRL provides the byte. The byte of interest of the src of
    // the SRL is Index + ByteShift
    // NOTE(review): the ':' arm of this conditional (presumably std::nullopt)
    // is missing from this listing -- confirm against upstream.
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op->getOperand(0), StartingIndex,
                                  Index + ByteShift)
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // If we are shifting by an amount greater than (or equal to)
    // the index we are trying to provide, then it provides 0s. If not,
    // then this bytes are not definitively 0s, and the corresponding byte
    // of interest is Index - ByteShift of the src
    // NOTE(review): the '?' arm of this conditional (presumably a
    // constant-zero provider) is missing from this listing -- confirm.
    return Index < ByteShift
               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
                                       Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::AssertZext:
  case ISD::AssertSext: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
        Op->getOpcode() == ISD::AssertZext ||
        Op->getOpcode() == ISD::AssertSext) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // Bytes beyond the narrow width are zero only for zero-extension.
    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                 : std::nullopt;
    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    if (BitWidth / 8 > Index)
      return calculateSrcByte(Op, StartingIndex, Index);

    return std::nullopt;
  }

  case ISD::LOAD: {
    auto *L = cast<LoadSDNode>(Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach byte we are trying to provide for
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, StartingIndex, Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;

    // Byte swap mirrors the byte index.
    return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
                                 Depth + 1, StartingIndex);
  }

    // NOTE(review): a case label (likely ISD::EXTRACT_VECTOR_ELT with an
    // opening brace) appears dropped from this listing here -- confirm.
    auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
                            StartingIndex, Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!PermMask)
      return std::nullopt;

    // Extract the per-byte selector for this index from the perm mask.
    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    // NOTE(review): the ':' continuation of this conditional (handling the
    // 0x0c zero selector) is missing from this listing -- confirm.
    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
  }

  default: {
    return std::nullopt;
  }
  }

  llvm_unreachable("fully handled switch");
}
14448
14449// Returns true if the Operand is a scalar and is 16 bits
14450static bool isExtendedFrom16Bits(SDValue &Operand) {
14451
14452 switch (Operand.getOpcode()) {
14453 case ISD::ANY_EXTEND:
14454 case ISD::SIGN_EXTEND:
14455 case ISD::ZERO_EXTEND: {
14456 auto OpVT = Operand.getOperand(0).getValueType();
14457 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14458 }
14459 case ISD::LOAD: {
14460 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14461 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
14462 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14463 ExtType == ISD::EXTLOAD) {
14464 auto MemVT = L->getMemoryVT();
14465 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14466 }
14467 return L->getMemoryVT().getSizeInBits() == 16;
14468 }
14469 default:
14470 return false;
14471 }
14472}
14473
// Returns true if the low and high selector bytes of \p Mask address two
// consecutive bytes (in increasing address order) whose first byte sits at
// an even, 16-bit aligned offset. Such a pair is directly addressable as a
// 16-bit operand without going through v_perm.
static bool addresses16Bits(int Mask) {
  const int LoSel = Mask & 0xff;
  const int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);
  // Both conditions must hold: the two bytes are adjacent with the high
  // selector one past the low one, and the pair starts on an even byte
  // boundary. E.g. bytes {1,2} are consecutive but misaligned — extracting
  // them as a 16-bit operand still needs extra code, so i8 v_perm is better.
  const bool Consecutive = (HiSel - LoSel == 1);
  const bool EvenAligned = (LoSel % 2 == 0);
  return Consecutive && EvenAligned;
}
14491
14492// Do not lower into v_perm if the operands are actually 16 bit
14493// and the selected bits (based on PermMask) correspond with two
14494// easily addressable 16 bit operands.
14496 SDValue &OtherOp) {
14497 int Low16 = PermMask & 0xffff;
14498 int Hi16 = (PermMask & 0xffff0000) >> 16;
14499
14500 auto TempOp = peekThroughBitcasts(Op);
14501 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14502
14503 auto OpIs16Bit =
14504 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14505 if (!OpIs16Bit)
14506 return true;
14507
14508 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14509 isExtendedFrom16Bits(TempOtherOp);
14510 if (!OtherOpIs16Bit)
14511 return true;
14512
14513 // Do we cleanly address both
14514 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14515}
14516
14518 unsigned DWordOffset) {
14519 SDValue Ret;
14520
14521 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14522 // ByteProvider must be at least 8 bits
14523 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14524
14525 if (TypeSize <= 32)
14526 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14527
14528 if (Src.getValueType().isVector()) {
14529 auto ScalarTySize = Src.getScalarValueSizeInBits();
14530 auto ScalarTy = Src.getValueType().getScalarType();
14531 if (ScalarTySize == 32) {
14532 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14533 DAG.getConstant(DWordOffset, SL, MVT::i32));
14534 }
14535 if (ScalarTySize > 32) {
14536 Ret = DAG.getNode(
14537 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14538 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14539 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14540 if (ShiftVal)
14541 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14542 DAG.getConstant(ShiftVal, SL, MVT::i32));
14543 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14544 }
14545
14546 assert(ScalarTySize < 32);
14547 auto NumElements = TypeSize / ScalarTySize;
14548 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14549 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14550 auto NumElementsIn32 = 32 / ScalarTySize;
14551 auto NumAvailElements = DWordOffset < Trunc32Elements
14552 ? NumElementsIn32
14553 : NumElements - NormalizedTrunc;
14554
14556 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14557 NumAvailElements);
14558
14559 Ret = DAG.getBuildVector(
14560 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14561 VecSrcs);
14562 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14563 }
14564
14565 /// Scalar Type
14566 auto ShiftVal = 32 * DWordOffset;
14567 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14568 DAG.getConstant(ShiftVal, SL, MVT::i32));
14569 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14570}
14571
14573 SelectionDAG &DAG = DCI.DAG;
14574 [[maybe_unused]] EVT VT = N->getValueType(0);
14576
14577 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14578 assert(VT == MVT::i32);
14579 for (int i = 0; i < 4; i++) {
14580 // Find the ByteProvider that provides the ith byte of the result of OR
14581 std::optional<ByteProvider<SDValue>> P =
14582 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14583 // TODO support constantZero
14584 if (!P || P->isConstantZero())
14585 return SDValue();
14586
14587 PermNodes.push_back(*P);
14588 }
14589 if (PermNodes.size() != 4)
14590 return SDValue();
14591
14592 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14593 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14594 uint64_t PermMask = 0x00000000;
14595 for (size_t i = 0; i < PermNodes.size(); i++) {
14596 auto PermOp = PermNodes[i];
14597 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14598 // by sizeof(Src2) = 4
14599 int SrcByteAdjust = 4;
14600
14601 // If the Src uses a byte from a different DWORD, then it corresponds
14602 // with a difference source
14603 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14604 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14605 if (SecondSrc)
14606 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14607 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14608 return SDValue();
14609
14610 // Set the index of the second distinct Src node
14611 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14612 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14613 SrcByteAdjust = 0;
14614 }
14615 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14617 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14618 }
14619 SDLoc DL(N);
14620 SDValue Op = *PermNodes[FirstSrc.first].Src;
14621 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14622 assert(Op.getValueSizeInBits() == 32);
14623
14624 // Check that we are not just extracting the bytes in order from an op
14625 if (!SecondSrc) {
14626 int Low16 = PermMask & 0xffff;
14627 int Hi16 = (PermMask & 0xffff0000) >> 16;
14628
14629 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14630 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14631
14632 // The perm op would really just produce Op. So combine into Op
14633 if (WellFormedLow && WellFormedHi)
14634 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14635 }
14636
14637 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14638
14639 if (SecondSrc) {
14640 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14641 assert(OtherOp.getValueSizeInBits() == 32);
14642 }
14643
14644 // Check that we haven't just recreated the same FSHR node.
14645 if (N->getOpcode() == ISD::FSHR &&
14646 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14647 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14648 return SDValue();
14649
14650 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14651
14652 assert(Op.getValueType().isByteSized() &&
14653 OtherOp.getValueType().isByteSized());
14654
14655 // If the ultimate src is less than 32 bits, then we will only be
14656 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14657 // CalculateByteProvider would not have returned Op as source if we
14658 // used a byte that is outside its ValueType. Thus, we are free to
14659 // ANY_EXTEND as the extended bits are dont-cares.
14660 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14661 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14662
14663 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14664 DAG.getConstant(PermMask, DL, MVT::i32));
14665 }
14666 return SDValue();
14667}
14668
14669SDValue SITargetLowering::performOrCombine(SDNode *N,
14670 DAGCombinerInfo &DCI) const {
14671 SelectionDAG &DAG = DCI.DAG;
14672 SDValue LHS = N->getOperand(0);
14673 SDValue RHS = N->getOperand(1);
14674
14675 EVT VT = N->getValueType(0);
14676 if (VT == MVT::i1) {
14677 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14678 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14679 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14680 SDValue Src = LHS.getOperand(0);
14681 if (Src != RHS.getOperand(0))
14682 return SDValue();
14683
14684 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14685 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14686 if (!CLHS || !CRHS)
14687 return SDValue();
14688
14689 // Only 10 bits are used.
14690 static const uint32_t MaxMask = 0x3ff;
14691
14692 uint32_t NewMask =
14693 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14694 SDLoc DL(N);
14695 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14696 DAG.getConstant(NewMask, DL, MVT::i32));
14697 }
14698
14699 return SDValue();
14700 }
14701
14702 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14704 LHS.getOpcode() == AMDGPUISD::PERM &&
14705 isa<ConstantSDNode>(LHS.getOperand(2))) {
14706 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14707 if (!Sel)
14708 return SDValue();
14709
14710 Sel |= LHS.getConstantOperandVal(2);
14711 SDLoc DL(N);
14712 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14713 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14714 }
14715
14716 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14717 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14718 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14719 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14720
14721 // If all the uses of an or need to extract the individual elements, do not
14722 // attempt to lower into v_perm
14723 auto usesCombinedOperand = [](SDNode *OrUse) {
14724 // If we have any non-vectorized use, then it is a candidate for v_perm
14725 if (OrUse->getOpcode() != ISD::BITCAST ||
14726 !OrUse->getValueType(0).isVector())
14727 return true;
14728
14729 // If we have any non-vectorized use, then it is a candidate for v_perm
14730 for (auto *VUser : OrUse->users()) {
14731 if (!VUser->getValueType(0).isVector())
14732 return true;
14733
14734 // If the use of a vector is a store, then combining via a v_perm
14735 // is beneficial.
14736 // TODO -- whitelist more uses
14737 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14738 if (VUser->getOpcode() == VectorwiseOp)
14739 return true;
14740 }
14741 return false;
14742 };
14743
14744 if (!any_of(N->users(), usesCombinedOperand))
14745 return SDValue();
14746
14747 uint32_t LHSMask = getPermuteMask(LHS);
14748 uint32_t RHSMask = getPermuteMask(RHS);
14749
14750 if (LHSMask != ~0u && RHSMask != ~0u) {
14751 // Canonicalize the expression in an attempt to have fewer unique masks
14752 // and therefore fewer registers used to hold the masks.
14753 if (LHSMask > RHSMask) {
14754 std::swap(LHSMask, RHSMask);
14755 std::swap(LHS, RHS);
14756 }
14757
14758 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14759 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14760 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14761 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14762
14763 // Check of we need to combine values from two sources within a byte.
14764 if (!(LHSUsedLanes & RHSUsedLanes) &&
14765 // If we select high and lower word keep it for SDWA.
14766 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14767 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14768 // Kill zero bytes selected by other mask. Zero value is 0xc.
14769 LHSMask &= ~RHSUsedLanes;
14770 RHSMask &= ~LHSUsedLanes;
14771 // Add 4 to each active LHS lane
14772 LHSMask |= LHSUsedLanes & 0x04040404;
14773 // Combine masks
14774 uint32_t Sel = LHSMask | RHSMask;
14775 SDLoc DL(N);
14776
14777 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14778 RHS.getOperand(0),
14779 DAG.getConstant(Sel, DL, MVT::i32));
14780 }
14781 }
14782 if (LHSMask == ~0u || RHSMask == ~0u) {
14783 if (SDValue Perm = matchPERM(N, DCI))
14784 return Perm;
14785 }
14786 }
14787
14788 // Detect identity v2i32 OR and replace with identity source node.
14789 // Specifically an Or that has operands constructed from the same source node
14790 // via extract_vector_elt and build_vector. I.E.
14791 // v2i32 or(
14792 // v2i32 build_vector(
14793 // i32 extract_elt(%IdentitySrc, 0),
14794 // i32 0
14795 // ),
14796 // v2i32 build_vector(
14797 // i32 0,
14798 // i32 extract_elt(%IdentitySrc, 1)
14799 // ) )
14800 // =>
14801 // v2i32 %IdentitySrc
14802
14803 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14804 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14805
14806 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14807 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14808
14809 // Test for and normalise build vectors.
14810 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14811
14812 // Get the extract_vector_element operands.
14813 SDValue LEVE = LHS->getOperand(0);
14814 SDValue REVE = RHS->getOperand(1);
14815
14816 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14818 // Check that different elements from the same vector are
14819 // extracted.
14820 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14821 LEVE->getOperand(1) != REVE->getOperand(1)) {
14822 SDValue IdentitySrc = LEVE.getOperand(0);
14823 return IdentitySrc;
14824 }
14825 }
14826 }
14827 }
14828
14829 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14830 return SDValue();
14831
14832 // TODO: This could be a generic combine with a predicate for extracting the
14833 // high half of an integer being free.
14834
14835 // (or i64:x, (zero_extend i32:y)) ->
14836 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14837 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14838 RHS.getOpcode() != ISD::ZERO_EXTEND)
14839 std::swap(LHS, RHS);
14840
14841 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14842 SDValue ExtSrc = RHS.getOperand(0);
14843 EVT SrcVT = ExtSrc.getValueType();
14844 if (SrcVT == MVT::i32) {
14845 SDLoc SL(N);
14846 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14847 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14848
14849 DCI.AddToWorklist(LowOr.getNode());
14850 DCI.AddToWorklist(HiBits.getNode());
14851
14852 SDValue Vec =
14853 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14854 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14855 }
14856 }
14857
14858 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14859 if (CRHS) {
14860 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14861 N->getOperand(0), CRHS))
14862 return Split;
14863 }
14864
14865 return SDValue();
14866}
14867
14868SDValue SITargetLowering::performXorCombine(SDNode *N,
14869 DAGCombinerInfo &DCI) const {
14870 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14871 return RV;
14872
14873 SDValue LHS = N->getOperand(0);
14874 SDValue RHS = N->getOperand(1);
14875
14876 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14877 SelectionDAG &DAG = DCI.DAG;
14878
14879 EVT VT = N->getValueType(0);
14880 if (CRHS && VT == MVT::i64) {
14881 if (SDValue Split =
14882 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14883 return Split;
14884 }
14885
14886 // v2i32 (xor (vselect cc, x, y), K) ->
14887 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14888 // replaced with source modifiers when the select is lowered to CNDMASK.
14889 unsigned Opc = LHS.getOpcode();
14890 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14891 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14892 CRHS && CRHS->getAPIntValue().isSignMask()) {
14893 SDValue CC = LHS->getOperand(0);
14894 SDValue TRUE = LHS->getOperand(1);
14895 SDValue FALSE = LHS->getOperand(2);
14896 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14897 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14898 SDValue XSelect =
14899 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14900 return XSelect;
14901 }
14902
14903 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14904 // fneg-like xors into 64-bit select.
14905 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14906 // This looks like an fneg, try to fold as a source modifier.
14907 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14909 // xor (select c, a, b), 0x80000000 ->
14910 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14911 SDLoc DL(N);
14912 SDValue CastLHS =
14913 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14914 SDValue CastRHS =
14915 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14916 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14917 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14918 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14919 LHS->getOperand(0), FNegLHS, FNegRHS);
14920 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14921 }
14922 }
14923
14924 return SDValue();
14925}
14926
14927SDValue
14928SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14929 DAGCombinerInfo &DCI) const {
14930 if (!Subtarget->has16BitInsts() ||
14931 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
14932 return SDValue();
14933
14934 EVT VT = N->getValueType(0);
14935 if (VT != MVT::i32)
14936 return SDValue();
14937
14938 SDValue Src = N->getOperand(0);
14939 if (Src.getValueType() != MVT::i16)
14940 return SDValue();
14941
14942 if (!Src->hasOneUse())
14943 return SDValue();
14944
14945 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
14946 // possible we're missing out on some combine opportunities, but we'd need to
14947 // weigh the cost of extracting the byte from the upper dwords.
14948
14949 std::optional<ByteProvider<SDValue>> BP0 =
14950 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
14951 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14952 return SDValue();
14953 SDValue V0 = *BP0->Src;
14954
14955 std::optional<ByteProvider<SDValue>> BP1 =
14956 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
14957 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14958 return SDValue();
14959
14960 SDValue V1 = *BP1->Src;
14961
14962 if (V0 == V1)
14963 return SDValue();
14964
14965 SelectionDAG &DAG = DCI.DAG;
14966 SDLoc DL(N);
14967 uint32_t PermMask = 0x0c0c0c0c;
14968 if (V0) {
14969 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
14970 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14971 }
14972
14973 if (V1) {
14974 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
14975 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14976 }
14977
14978 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
14979 DAG.getConstant(PermMask, DL, MVT::i32));
14980}
14981
14982SDValue
14983SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14984 DAGCombinerInfo &DCI) const {
14985 SDValue Src = N->getOperand(0);
14986 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14987
14988 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14989 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14990 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14991 VTSign->getVT() == MVT::i8) ||
14992 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14993 VTSign->getVT() == MVT::i16))) {
14994 assert(Subtarget->hasScalarSubwordLoads() &&
14995 "s_buffer_load_{u8, i8} are supported "
14996 "in GFX12 (or newer) architectures.");
14997 EVT VT = Src.getValueType();
14998 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14999 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15000 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15001 SDLoc DL(N);
15002 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15003 SDValue Ops[] = {
15004 Src.getOperand(0), // source register
15005 Src.getOperand(1), // offset
15006 Src.getOperand(2) // cachePolicy
15007 };
15008 auto *M = cast<MemSDNode>(Src);
15009 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15010 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15011 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
15012 return LoadVal;
15013 }
15014 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15015 VTSign->getVT() == MVT::i8) ||
15016 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15017 VTSign->getVT() == MVT::i16)) &&
15018 Src.hasOneUse()) {
15019 auto *M = cast<MemSDNode>(Src);
15020 SDValue Ops[] = {Src.getOperand(0), // Chain
15021 Src.getOperand(1), // rsrc
15022 Src.getOperand(2), // vindex
15023 Src.getOperand(3), // voffset
15024 Src.getOperand(4), // soffset
15025 Src.getOperand(5), // offset
15026 Src.getOperand(6), Src.getOperand(7)};
15027 // replace with BUFFER_LOAD_BYTE/SHORT
15028 SDVTList ResList =
15029 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15030 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15031 ? AMDGPUISD::BUFFER_LOAD_BYTE
15032 : AMDGPUISD::BUFFER_LOAD_SHORT;
15033 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15034 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15035 return DCI.DAG.getMergeValues(
15036 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
15037 }
15038 return SDValue();
15039}
15040
15041SDValue SITargetLowering::performClassCombine(SDNode *N,
15042 DAGCombinerInfo &DCI) const {
15043 SelectionDAG &DAG = DCI.DAG;
15044 SDValue Mask = N->getOperand(1);
15045
15046 // fp_class x, 0 -> false
15047 if (isNullConstant(Mask))
15048 return DAG.getConstant(0, SDLoc(N), MVT::i1);
15049
15050 if (N->getOperand(0).isUndef())
15051 return DAG.getUNDEF(MVT::i1);
15052
15053 return SDValue();
15054}
15055
15056SDValue SITargetLowering::performRcpCombine(SDNode *N,
15057 DAGCombinerInfo &DCI) const {
15058 EVT VT = N->getValueType(0);
15059 SDValue N0 = N->getOperand(0);
15060
15061 if (N0.isUndef()) {
15062 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
15063 SDLoc(N), VT);
15064 }
15065
15066 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
15067 N0.getOpcode() == ISD::SINT_TO_FP)) {
15068 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
15069 N->getFlags());
15070 }
15071
15072 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
15073 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
15074 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
15075 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
15076 N->getFlags());
15077 }
15078
15080}
15081
15083 SDNodeFlags UserFlags,
15084 unsigned MaxDepth) const {
15085 unsigned Opcode = Op.getOpcode();
15086 if (Opcode == ISD::FCANONICALIZE)
15087 return true;
15088
15089 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15090 const auto &F = CFP->getValueAPF();
15091 if (F.isNaN() && F.isSignaling())
15092 return false;
15093 if (!F.isDenormal())
15094 return true;
15095
15096 DenormalMode Mode =
15097 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
15098 return Mode == DenormalMode::getIEEE();
15099 }
15100
15101 // If source is a result of another standard FP operation it is already in
15102 // canonical form.
15103 if (MaxDepth == 0)
15104 return false;
15105
15106 switch (Opcode) {
15107 // These will flush denorms if required.
15108 case ISD::FADD:
15109 case ISD::FSUB:
15110 case ISD::FMUL:
15111 case ISD::FCEIL:
15112 case ISD::FFLOOR:
15113 case ISD::FMA:
15114 case ISD::FMAD:
15115 case ISD::FSQRT:
15116 case ISD::FDIV:
15117 case ISD::FREM:
15118 case ISD::FP_ROUND:
15119 case ISD::FP_EXTEND:
15120 case ISD::FP16_TO_FP:
15121 case ISD::FP_TO_FP16:
15122 case ISD::BF16_TO_FP:
15123 case ISD::FP_TO_BF16:
15124 case ISD::FLDEXP:
15125 case AMDGPUISD::FMUL_LEGACY:
15126 case AMDGPUISD::FMAD_FTZ:
15127 case AMDGPUISD::RCP:
15128 case AMDGPUISD::RSQ:
15129 case AMDGPUISD::RSQ_CLAMP:
15130 case AMDGPUISD::RCP_LEGACY:
15131 case AMDGPUISD::RCP_IFLAG:
15132 case AMDGPUISD::LOG:
15133 case AMDGPUISD::EXP:
15134 case AMDGPUISD::DIV_SCALE:
15135 case AMDGPUISD::DIV_FMAS:
15136 case AMDGPUISD::DIV_FIXUP:
15137 case AMDGPUISD::FRACT:
15138 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15139 case AMDGPUISD::CVT_F32_UBYTE0:
15140 case AMDGPUISD::CVT_F32_UBYTE1:
15141 case AMDGPUISD::CVT_F32_UBYTE2:
15142 case AMDGPUISD::CVT_F32_UBYTE3:
15143 case AMDGPUISD::FP_TO_FP16:
15144 case AMDGPUISD::SIN_HW:
15145 case AMDGPUISD::COS_HW:
15146 return true;
15147
15148 // It can/will be lowered or combined as a bit operation.
15149 // Need to check their input recursively to handle.
15150 case ISD::FNEG:
15151 case ISD::FABS:
15152 case ISD::FCOPYSIGN:
15153 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15154
15155 case ISD::AND:
15156 if (Op.getValueType() == MVT::i32) {
15157 // Be careful as we only know it is a bitcast floating point type. It
15158 // could be f32, v2f16, we have no way of knowing. Luckily the constant
15159 // value that we optimize for, which comes up in fp32 to bf16 conversions,
15160 // is valid to optimize for all types.
15161 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15162 if (RHS->getZExtValue() == 0xffff0000) {
15163 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15164 }
15165 }
15166 }
15167 break;
15168
15169 case ISD::FSIN:
15170 case ISD::FCOS:
15171 case ISD::FSINCOS:
15172 return Op.getValueType().getScalarType() != MVT::f16;
15173
15174 case ISD::FMINNUM:
15175 case ISD::FMAXNUM:
15176 case ISD::FMINNUM_IEEE:
15177 case ISD::FMAXNUM_IEEE:
15178 case ISD::FMINIMUM:
15179 case ISD::FMAXIMUM:
15180 case ISD::FMINIMUMNUM:
15181 case ISD::FMAXIMUMNUM:
15182 case AMDGPUISD::CLAMP:
15183 case AMDGPUISD::FMED3:
15184 case AMDGPUISD::FMAX3:
15185 case AMDGPUISD::FMIN3:
15186 case AMDGPUISD::FMAXIMUM3:
15187 case AMDGPUISD::FMINIMUM3: {
15188 // FIXME: Shouldn't treat the generic operations different based these.
15189 // However, we aren't really required to flush the result from
15190 // minnum/maxnum..
15191
15192 // snans will be quieted, so we only need to worry about denormals.
15193 if (Subtarget->supportsMinMaxDenormModes() ||
15194 // FIXME: denormalsEnabledForType is broken for dynamic
15195 denormalsEnabledForType(DAG, Op.getValueType()))
15196 return true;
15197
15198 // Flushing may be required.
15199 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15200 // targets need to check their input recursively.
15201
15202 // FIXME: Does this apply with clamp? It's implemented with max.
15203 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15204 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
15205 return false;
15206 }
15207
15208 return true;
15209 }
15210 case ISD::SELECT: {
15211 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
15212 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
15213 }
15214 case ISD::BUILD_VECTOR: {
15215 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15216 SDValue SrcOp = Op.getOperand(i);
15217 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
15218 return false;
15219 }
15220
15221 return true;
15222 }
15225 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15226 }
15228 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
15229 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
15230 }
15231 case ISD::UNDEF:
15232 // Could be anything.
15233 return false;
15234
15235 case ISD::BITCAST:
15236 // TODO: This is incorrect as it loses track of the operand's type. We may
15237 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15238 // same bits that are canonicalized in one type need not be in the other.
15239 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15240 case ISD::TRUNCATE: {
15241 // Hack round the mess we make when legalizing extract_vector_elt
15242 if (Op.getValueType() == MVT::i16) {
15243 SDValue TruncSrc = Op.getOperand(0);
15244 if (TruncSrc.getValueType() == MVT::i32 &&
15245 TruncSrc.getOpcode() == ISD::BITCAST &&
15246 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
15247 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
15248 }
15249 }
15250 return false;
15251 }
15253 unsigned IntrinsicID = Op.getConstantOperandVal(0);
15254 // TODO: Handle more intrinsics
15255 switch (IntrinsicID) {
15256 case Intrinsic::amdgcn_cvt_pkrtz:
15257 case Intrinsic::amdgcn_cubeid:
15258 case Intrinsic::amdgcn_frexp_mant:
15259 case Intrinsic::amdgcn_fdot2:
15260 case Intrinsic::amdgcn_rcp:
15261 case Intrinsic::amdgcn_rsq:
15262 case Intrinsic::amdgcn_rsq_clamp:
15263 case Intrinsic::amdgcn_rcp_legacy:
15264 case Intrinsic::amdgcn_rsq_legacy:
15265 case Intrinsic::amdgcn_trig_preop:
15266 case Intrinsic::amdgcn_tanh:
15267 case Intrinsic::amdgcn_log:
15268 case Intrinsic::amdgcn_exp2:
15269 case Intrinsic::amdgcn_sqrt:
15270 return true;
15271 default:
15272 break;
15273 }
15274
15275 break;
15276 }
15277 default:
15278 break;
15279 }
15280
15281 // FIXME: denormalsEnabledForType is broken for dynamic
15282 return denormalsEnabledForType(DAG, Op.getValueType()) &&
15283 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15284}
15285
// NOTE(review): the extraction lost the preceding signature line; from the
// body this is the GlobalISel variant
// SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
// unsigned MaxDepth) — returns true if the value in \p Reg is already in
// canonical FP form (so an explicit G_FCANONICALIZE of it is a no-op).
// TODO: confirm the exact signature against upstream.
15287 unsigned MaxDepth) const {
15288 const MachineRegisterInfo &MRI = MF.getRegInfo();
15289 MachineInstr *MI = MRI.getVRegDef(Reg);
15290 unsigned Opcode = MI->getOpcode();
15291
// An explicit canonicalize is canonical by definition.
15292 if (Opcode == AMDGPU::G_FCANONICALIZE)
15293 return true;
15294
15295 std::optional<FPValueAndVReg> FCR;
15296 // Constant splat (can be padded with undef) or scalar constant.
15297 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
// Signaling NaNs are never canonical; a denormal constant is canonical
// only when the function runs with full IEEE denormal handling for that
// semantics (otherwise it would be flushed).
15298 if (FCR->Value.isSignaling())
15299 return false;
15300 if (!FCR->Value.isDenormal())
15301 return true;
15302
15303 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
15304 return Mode == DenormalMode::getIEEE();
15305 }
15306
// Recursion budget exhausted: conservatively report non-canonical.
15307 if (MaxDepth == 0)
15308 return false;
15309
15310 switch (Opcode) {
// These operations always produce canonicalized results on this target.
15311 case AMDGPU::G_FADD:
15312 case AMDGPU::G_FSUB:
15313 case AMDGPU::G_FMUL:
15314 case AMDGPU::G_FCEIL:
15315 case AMDGPU::G_FFLOOR:
15316 case AMDGPU::G_FRINT:
15317 case AMDGPU::G_FNEARBYINT:
15318 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15319 case AMDGPU::G_INTRINSIC_TRUNC:
15320 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15321 case AMDGPU::G_FMA:
15322 case AMDGPU::G_FMAD:
15323 case AMDGPU::G_FSQRT:
15324 case AMDGPU::G_FDIV:
15325 case AMDGPU::G_FREM:
15326 case AMDGPU::G_FPOW:
15327 case AMDGPU::G_FPEXT:
15328 case AMDGPU::G_FLOG:
15329 case AMDGPU::G_FLOG2:
15330 case AMDGPU::G_FLOG10:
15331 case AMDGPU::G_FPTRUNC:
15332 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15333 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15334 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15335 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15336 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15337 return true;
// Sign-bit-only operations: canonical iff the source operand is.
15338 case AMDGPU::G_FNEG:
15339 case AMDGPU::G_FABS:
15340 case AMDGPU::G_FCOPYSIGN:
15341 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
15342 case AMDGPU::G_FMINNUM:
15343 case AMDGPU::G_FMAXNUM:
15344 case AMDGPU::G_FMINNUM_IEEE:
15345 case AMDGPU::G_FMAXNUM_IEEE:
15346 case AMDGPU::G_FMINIMUM:
15347 case AMDGPU::G_FMAXIMUM:
15348 case AMDGPU::G_FMINIMUMNUM:
15349 case AMDGPU::G_FMAXIMUMNUM: {
// min/max flush denormals depending on the mode; if the subtarget handles
// denormals in min/max (or denormals are enabled) the result is canonical.
// Otherwise fall through and require all source operands to be canonical.
15350 if (Subtarget->supportsMinMaxDenormModes() ||
15351 // FIXME: denormalsEnabledForType is broken for dynamic
15352 denormalsEnabledForType(MRI.getType(Reg), MF))
15353 return true;
15354
15355 [[fallthrough]];
15356 }
15357 case AMDGPU::G_BUILD_VECTOR:
// A build_vector is canonical iff every source element is (operand 0 is
// the def, hence drop_begin).
15358 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
15359 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
15360 return false;
15361 return true;
15362 case AMDGPU::G_INTRINSIC:
15363 case AMDGPU::G_INTRINSIC_CONVERGENT:
// Whitelist of AMDGCN intrinsics known to produce canonical results.
15364 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15365 case Intrinsic::amdgcn_fmul_legacy:
15366 case Intrinsic::amdgcn_fmad_ftz:
15367 case Intrinsic::amdgcn_sqrt:
15368 case Intrinsic::amdgcn_fmed3:
15369 case Intrinsic::amdgcn_sin:
15370 case Intrinsic::amdgcn_cos:
15371 case Intrinsic::amdgcn_log:
15372 case Intrinsic::amdgcn_exp2:
15373 case Intrinsic::amdgcn_log_clamp:
15374 case Intrinsic::amdgcn_rcp:
15375 case Intrinsic::amdgcn_rcp_legacy:
15376 case Intrinsic::amdgcn_rsq:
15377 case Intrinsic::amdgcn_rsq_clamp:
15378 case Intrinsic::amdgcn_rsq_legacy:
15379 case Intrinsic::amdgcn_div_scale:
15380 case Intrinsic::amdgcn_div_fmas:
15381 case Intrinsic::amdgcn_div_fixup:
15382 case Intrinsic::amdgcn_fract:
15383 case Intrinsic::amdgcn_cvt_pkrtz:
15384 case Intrinsic::amdgcn_cubeid:
15385 case Intrinsic::amdgcn_cubema:
15386 case Intrinsic::amdgcn_cubesc:
15387 case Intrinsic::amdgcn_cubetc:
15388 case Intrinsic::amdgcn_frexp_mant:
15389 case Intrinsic::amdgcn_fdot2:
15390 case Intrinsic::amdgcn_trig_preop:
15391 case Intrinsic::amdgcn_tanh:
15392 return true;
15393 default:
15394 break;
15395 }
15396
15397 [[fallthrough]];
15398 default:
15399 return false;
15400 }
15401
15402 llvm_unreachable("invalid operation");
15403}
15404
15405// Constant fold canonicalize.
15406SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15407 const SDLoc &SL, EVT VT,
15408 const APFloat &C) const {
15409 // Flush denormals to 0 if not enabled.
15410 if (C.isDenormal()) {
15411 DenormalMode Mode =
15412 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15413 if (Mode == DenormalMode::getPreserveSign()) {
15414 return DAG.getConstantFP(
15415 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15416 }
15417
15418 if (Mode != DenormalMode::getIEEE())
15419 return SDValue();
15420 }
15421
15422 if (C.isNaN()) {
15423 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15424 if (C.isSignaling()) {
15425 // Quiet a signaling NaN.
15426 // FIXME: Is this supposed to preserve payload bits?
15427 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15428 }
15429
15430 // Make sure it is the canonical NaN bitpattern.
15431 //
15432 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15433 // immediate?
15434 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15435 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15436 }
15437
15438 // Already canonical.
15439 return DAG.getConstantFP(C, SL, VT);
15440}
15441
15443 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
15444}
15445
15446SDValue
15447SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15448 DAGCombinerInfo &DCI) const {
15449 SelectionDAG &DAG = DCI.DAG;
15450 SDValue N0 = N->getOperand(0);
15451 EVT VT = N->getValueType(0);
15452
15453 // fcanonicalize undef -> qnan
15454 if (N0.isUndef()) {
15456 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
15457 }
15458
15459 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
15460 EVT VT = N->getValueType(0);
15461 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15462 }
15463
15464 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15465 // (fcanonicalize k)
15466 //
15467 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15468
15469 // TODO: This could be better with wider vectors that will be split to v2f16,
15470 // and to consider uses since there aren't that many packed operations.
15471 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15472 isTypeLegal(MVT::v2f16)) {
15473 SDLoc SL(N);
15474 SDValue NewElts[2];
15475 SDValue Lo = N0.getOperand(0);
15476 SDValue Hi = N0.getOperand(1);
15477 EVT EltVT = Lo.getValueType();
15478
15480 for (unsigned I = 0; I != 2; ++I) {
15481 SDValue Op = N0.getOperand(I);
15482 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15483 NewElts[I] =
15484 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15485 } else if (Op.isUndef()) {
15486 // Handled below based on what the other operand is.
15487 NewElts[I] = Op;
15488 } else {
15489 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
15490 }
15491 }
15492
15493 // If one half is undef, and one is constant, prefer a splat vector rather
15494 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15495 // cheaper to use and may be free with a packed operation.
15496 if (NewElts[0].isUndef()) {
15497 if (isa<ConstantFPSDNode>(NewElts[1]))
15498 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
15499 ? NewElts[1]
15500 : DAG.getConstantFP(0.0f, SL, EltVT);
15501 }
15502
15503 if (NewElts[1].isUndef()) {
15504 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
15505 ? NewElts[0]
15506 : DAG.getConstantFP(0.0f, SL, EltVT);
15507 }
15508
15509 return DAG.getBuildVector(VT, SL, NewElts);
15510 }
15511 }
15512
15513 return SDValue();
15514}
15515
15516static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15517 switch (Opc) {
15518 case ISD::FMAXNUM:
15519 case ISD::FMAXNUM_IEEE:
15520 case ISD::FMAXIMUMNUM:
15521 return AMDGPUISD::FMAX3;
15522 case ISD::FMAXIMUM:
15523 return AMDGPUISD::FMAXIMUM3;
15524 case ISD::SMAX:
15525 return AMDGPUISD::SMAX3;
15526 case ISD::UMAX:
15527 return AMDGPUISD::UMAX3;
15528 case ISD::FMINNUM:
15529 case ISD::FMINNUM_IEEE:
15530 case ISD::FMINIMUMNUM:
15531 return AMDGPUISD::FMIN3;
15532 case ISD::FMINIMUM:
15533 return AMDGPUISD::FMINIMUM3;
15534 case ISD::SMIN:
15535 return AMDGPUISD::SMIN3;
15536 case ISD::UMIN:
15537 return AMDGPUISD::UMIN3;
15538 default:
15539 llvm_unreachable("Not a min/max opcode");
15540 }
15541}
15542
15543SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15544 const SDLoc &SL, SDValue Src,
15545 SDValue MinVal,
15546 SDValue MaxVal,
15547 bool Signed) const {
15548
15549 // med3 comes from
15550 // min(max(x, K0), K1), K0 < K1
15551 // max(min(x, K0), K1), K1 < K0
15552 //
15553 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15554 // min/max op.
15555 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15556 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15557
15558 if (!MinK || !MaxK)
15559 return SDValue();
15560
15561 if (Signed) {
15562 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15563 return SDValue();
15564 } else {
15565 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15566 return SDValue();
15567 }
15568
15569 EVT VT = MinK->getValueType(0);
15570 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15571 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15572 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15573
15574 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15575 // not available, but this is unlikely to be profitable as constants
15576 // will often need to be materialized & extended, especially on
15577 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15578 return SDValue();
15579}
15580
15583 return C;
15584
15586 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15587 return C;
15588 }
15589
15590 return nullptr;
15591}
15592
// Try to form FMED3 (or CLAMP) from min(max(x, K0), K1) with K0 <= K1.
// \p Op0 is the inner max node, \p Op1 the outer min's constant operand.
// Returns an empty SDValue if the pattern or NaN-safety requirements are not
// met.
15593 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15594 const SDLoc &SL, SDValue Op0,
15595 SDValue Op1,
15596 bool IsKnownNoNaNs) const {
15597 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
15598 if (!K1)
15599 return SDValue();
15600
15601 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15602 if (!K0)
15603 return SDValue();
15604
15605 // Ordered >= (although NaN inputs should have folded away by now).
15606 if (K0->getValueAPF() > K1->getValueAPF())
15607 return SDValue();
15608
15609 // med3 with a nan input acts like
15610 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15611 //
15612 // So the result depends on whether the IEEE mode bit is enabled or not with a
15613 // signaling nan input.
15614 // ieee=1
15615 // s0 snan: yields s2
15616 // s1 snan: yields s2
15617 // s2 snan: qnan
15618
15619 // s0 qnan: min(s1, s2)
15620 // s1 qnan: min(s0, s2)
15621 // s2 qnan: min(s0, s1)
15622
15623 // ieee=0
15624 // s0 snan: min(s1, s2)
15625 // s1 snan: min(s0, s2)
15626 // s2 snan: qnan
15627
15628 // s0 qnan: min(s1, s2)
15629 // s1 qnan: min(s0, s2)
15630 // s2 qnan: min(s0, s1)
15631 const MachineFunction &MF = DAG.getMachineFunction();
15632 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15633
15634 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15635 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15636 // can only form if op0 is fmaxnum_ieee if IEEE=1.
15637 EVT VT = Op0.getValueType();
15638 if (Info->getMode().DX10Clamp) {
15639 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15640 // hardware fmed3 behavior converting to a min.
15641 // FIXME: Should this be allowing -0.0?
15642 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15643 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15644 }
15645
15646 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15647 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15648 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15649 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15650 // then give the other result, which is different from med3 with a NaN
15651 // input.
15652 SDValue Var = Op0.getOperand(0);
15653 if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Var))
15654 return SDValue();
15655
15656 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15657
// Only form med3 when each constant is either single-use (no extra
// materialization cost) or an inline immediate.
15658 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15659 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15660 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15661 SDValue(K0, 0), SDValue(K1, 0));
15662 }
15663 }
15664
15665 return SDValue();
15666}
15667
15668/// \return true if the subtarget supports minimum3 and maximum3 with the given
15669/// base min/max opcode \p Opc for type \p VT.
15670static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15671 EVT VT) {
15672 switch (Opc) {
15673 case ISD::FMINNUM:
15674 case ISD::FMAXNUM:
15675 case ISD::FMINNUM_IEEE:
15676 case ISD::FMAXNUM_IEEE:
15677 case ISD::FMINIMUMNUM:
15678 case ISD::FMAXIMUMNUM:
15679 case AMDGPUISD::FMIN_LEGACY:
15680 case AMDGPUISD::FMAX_LEGACY:
15681 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15682 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15683 case ISD::FMINIMUM:
15684 case ISD::FMAXIMUM:
15685 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15686 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15687 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15688 case ISD::SMAX:
15689 case ISD::SMIN:
15690 case ISD::UMAX:
15691 case ISD::UMIN:
15692 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15693 default:
15694 return false;
15695 }
15696
15697 llvm_unreachable("not a min/max opcode");
15698}
15699
// Combine chained min/max into min3/max3, clamp-style pairs into med3, and
// demote no-NaN fminimum/fmaximum to the cheaper IEEE forms.
15700 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15701 DAGCombinerInfo &DCI) const {
15702 SelectionDAG &DAG = DCI.DAG;
15703
15704 EVT VT = N->getValueType(0);
15705 unsigned Opc = N->getOpcode();
15706 SDValue Op0 = N->getOperand(0);
15707 SDValue Op1 = N->getOperand(1);
15708
15709 // Only do this if the inner op has one use since this will just increases
15710 // register pressure for no benefit.
15711
15712 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15713 // max(max(a, b), c) -> max3(a, b, c)
15714 // min(min(a, b), c) -> min3(a, b, c)
15715 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15716 SDLoc DL(N);
15717 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15718 Op0.getOperand(0), Op0.getOperand(1), Op1);
15719 }
15720
15721 // Try commuted.
15722 // max(a, max(b, c)) -> max3(a, b, c)
15723 // min(a, min(b, c)) -> min3(a, b, c)
15724 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15725 SDLoc DL(N);
15726 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15727 Op0, Op1.getOperand(0), Op1.getOperand(1));
15728 }
15729 }
15730
15731 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15732 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15733 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15734 if (SDValue Med3 = performIntMed3ImmCombine(
15735 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15736 return Med3;
15737 }
15738 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15739 if (SDValue Med3 = performIntMed3ImmCombine(
15740 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15741 return Med3;
15742 }
15743
15744 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15745 if (SDValue Med3 = performIntMed3ImmCombine(
15746 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15747 return Med3;
15748 }
15749 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15750 if (SDValue Med3 = performIntMed3ImmCombine(
15751 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15752 return Med3;
15753 }
15754
15755 // if !is_snan(x):
15756 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15757 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15758 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15759 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
// NOTE(review): extraction dropped two lines (15761-15762) of this condition;
// per the comment above they cover the FMINNUM_IEEE/FMAXNUM_IEEE and
// FMINIMUMNUM/FMAXIMUMNUM opcode pairs — confirm against upstream.
15760 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15763 (Opc == AMDGPUISD::FMIN_LEGACY &&
15764 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15765 (VT == MVT::f32 || VT == MVT::f64 ||
15766 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15767 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15768 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15769 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15770 Op0.hasOneUse()) {
15771 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
15772 N->getFlags().hasNoNaNs()))
15773 return Res;
15774 }
15775
15776 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15777 // for some types, but at a higher cost since it's implemented with a 3
15778 // operand form.
// NOTE(review): extraction dropped lines 15782 (tail of the condition) and
// 15784 (the opcode selection, presumably
// Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE) — confirm
// against upstream.
15779 const SDNodeFlags Flags = N->getFlags();
15780 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
15781 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15783 unsigned NewOpc =
15785 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15786 }
15787
15788 return SDValue();
15789}
15790
15794 // FIXME: Should this be allowing -0.0?
15795 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15796 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15797 }
15798 }
15799
15800 return false;
15801}
15802
15803// FIXME: Should only worry about snans for version with chain.
15804SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15805 DAGCombinerInfo &DCI) const {
15806 EVT VT = N->getValueType(0);
15807 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15808 // NaNs. With a NaN input, the order of the operands may change the result.
15809
15810 SelectionDAG &DAG = DCI.DAG;
15811 SDLoc SL(N);
15812
15813 SDValue Src0 = N->getOperand(0);
15814 SDValue Src1 = N->getOperand(1);
15815 SDValue Src2 = N->getOperand(2);
15816
15817 if (isClampZeroToOne(Src0, Src1)) {
15818 // const_a, const_b, x -> clamp is safe in all cases including signaling
15819 // nans.
15820 // FIXME: Should this be allowing -0.0?
15821 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15822 }
15823
15824 const MachineFunction &MF = DAG.getMachineFunction();
15825 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15826
15827 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15828 // handling no dx10-clamp?
15829 if (Info->getMode().DX10Clamp) {
15830 // If NaNs is clamped to 0, we are free to reorder the inputs.
15831
15832 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15833 std::swap(Src0, Src1);
15834
15835 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15836 std::swap(Src1, Src2);
15837
15838 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15839 std::swap(Src0, Src1);
15840
15841 if (isClampZeroToOne(Src1, Src2))
15842 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15843 }
15844
15845 return SDValue();
15846}
15847
15848SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15849 DAGCombinerInfo &DCI) const {
15850 SDValue Src0 = N->getOperand(0);
15851 SDValue Src1 = N->getOperand(1);
15852 if (Src0.isUndef() && Src1.isUndef())
15853 return DCI.DAG.getUNDEF(N->getValueType(0));
15854 return SDValue();
15855}
15856
15857 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15858 // expanded into a set of cmp/select instructions.
// NOTE(review): extraction dropped line 15859 — the signature start,
// presumably
// bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, —
// confirm against upstream.
15860 unsigned NumElem,
15861 bool IsDivergentIdx,
15862 const GCNSubtarget *Subtarget) {
// NOTE(review): extraction dropped line 15863, the condition guarding this
// early-out (presumably the UseDivergentRegisterIndexing cl::opt declared at
// the top of the file) — confirm against upstream.
15864 return false;
15865
15866 unsigned VecSize = EltSize * NumElem;
15867
15868 // Sub-dword vectors of size 2 dword or less have better implementation.
15869 if (VecSize <= 64 && EltSize < 32)
15870 return false;
15871
15872 // Always expand the rest of sub-dword instructions, otherwise it will be
15873 // lowered via memory.
15874 if (EltSize < 32)
15875 return true;
15876
15877 // Always do this if var-idx is divergent, otherwise it will become a loop.
15878 if (IsDivergentIdx)
15879 return true;
15880
15881 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15882 unsigned NumInsts = NumElem /* Number of compares */ +
15883 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15884
15885 // On some architectures (GFX9) movrel is not available and it's better
15886 // to expand.
15887 if (Subtarget->useVGPRIndexMode())
15888 return NumInsts <= 16;
15889
15890 // If movrel is available, use it instead of expanding for vector of 8
15891 // elements.
15892 if (Subtarget->hasMovrel())
15893 return NumInsts <= 15;
15894
15895 return true;
15896}
15897
// SDNode overload: gathers element size/count and index divergence from the
// EXTRACT/INSERT_VECTOR_ELT node and defers to the parameterized variant.
// NOTE(review): extraction dropped line 15898 — the signature, presumably
// bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { —
// confirm against upstream.
15899 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
// A constant index never needs the cmp/select expansion.
15900 if (isa<ConstantSDNode>(Idx))
15901 return false;
15902
15903 SDValue Vec = N->getOperand(0);
15904 EVT VecVT = Vec.getValueType();
15905 EVT EltVT = VecVT.getVectorElementType();
15906 unsigned EltSize = EltVT.getSizeInBits();
15907 unsigned NumElem = VecVT.getVectorNumElements();
15908
// NOTE(review): extraction dropped line 15909 — the call line, presumably
// return SITargetLowering::shouldExpandVectorDynExt( — confirm upstream.
15910 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15911}
15912
// DAG combine for EXTRACT_VECTOR_ELT: hoists fneg/fabs out of the vector,
// scalarizes binops, expands divergent dynamic indexing into selects, folds
// extracts of bitcast 64-bit immediates, and widens sub-dword extracts of
// loaded vectors to 32-bit extracts.
15913 SDValue
15914 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15915 DAGCombinerInfo &DCI) const {
15916 SDValue Vec = N->getOperand(0);
15917 SelectionDAG &DAG = DCI.DAG;
15918
15919 EVT VecVT = Vec.getValueType();
15920 EVT VecEltVT = VecVT.getVectorElementType();
15921 EVT ResVT = N->getValueType(0);
15922
15923 unsigned VecSize = VecVT.getSizeInBits();
15924 unsigned VecEltSize = VecEltVT.getSizeInBits();
15925
// extract_vector_elt (fneg/fabs v), i -> fneg/fabs (extract_vector_elt v, i)
// NOTE(review): extraction dropped line 15927 — the remainder of this
// condition (a source-modifier profitability check) — confirm upstream.
15926 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15928 SDLoc SL(N);
15929 SDValue Idx = N->getOperand(1);
15930 SDValue Elt =
15931 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15932 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15933 }
15934
15935 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15936 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15937 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15938 // depending on the shift operand. See e.g. performSraCombine().
15939 // This combine ensures that the optimisation is compatible with v2i32
15940 // legalised AND.
15941 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15942 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15943
// NOTE(review): extraction dropped line 15944 — presumably the dyn_cast of
// the build_vector's splat operand into `C` used just below — confirm
// upstream.
15945 if (!C || C->getZExtValue() != 0x1f)
15946 return SDValue();
15947
15948 SDLoc SL(N);
15949 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15950 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15951 Vec->getOperand(0), N->getOperand(1));
15952 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15953 DAG.ReplaceAllUsesWith(N, A.getNode());
15954 }
15955
15956 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15957 // =>
15958 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15959 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15960 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15961 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15962 SDLoc SL(N);
15963 SDValue Idx = N->getOperand(1);
15964 unsigned Opc = Vec.getOpcode();
15965
15966 switch (Opc) {
15967 default:
15968 break;
15969 // TODO: Support other binary operations.
15970 case ISD::FADD:
15971 case ISD::FSUB:
15972 case ISD::FMUL:
15973 case ISD::ADD:
15974 case ISD::UMIN:
15975 case ISD::UMAX:
15976 case ISD::SMIN:
15977 case ISD::SMAX:
15978 case ISD::FMAXNUM:
15979 case ISD::FMINNUM:
15980 case ISD::FMAXNUM_IEEE:
15981 case ISD::FMINNUM_IEEE:
15982 case ISD::FMAXIMUM:
15983 case ISD::FMINIMUM: {
15984 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15985 Vec.getOperand(0), Idx);
15986 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15987 Vec.getOperand(1), Idx);
15988
15989 DCI.AddToWorklist(Elt0.getNode());
15990 DCI.AddToWorklist(Elt1.getNode());
15991 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15992 }
15993 }
15994 }
15995
15996 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
// NOTE(review): extraction dropped line 15997 — the guard for this
// expansion, presumably `if (shouldExpandVectorDynExt(N)) {` — confirm
// upstream.
15998 SDLoc SL(N);
15999 SDValue Idx = N->getOperand(1);
16000 SDValue V;
// Build a chain of selects comparing the dynamic index against each
// constant index; V accumulates the result.
16001 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16002 SDValue IC = DAG.getVectorIdxConstant(I, SL);
16003 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
16004 if (I == 0)
16005 V = Elt;
16006 else
16007 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
16008 }
16009 return V;
16010 }
16011
16012 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
16013 // =>
16014 // i32:Lo(k) if Idx == 0, or
16015 // i32:Hi(k) if Idx == 1
16016 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
16017 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
16018 SDLoc SL(N);
16019 SDValue PeekThrough = Vec.getOperand(0);
16020 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
16021 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16022 uint64_t KImmValue = KImm->getZExtValue();
16023 return DAG.getConstant(
16024 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16025 }
16026 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
16027 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16028 uint64_t KFPImmValue =
16029 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16030 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16031 0xffffffff,
16032 SL, MVT::i32);
16033 }
16034 }
16035
16036 if (!DCI.isBeforeLegalize())
16037 return SDValue();
16038
16039 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
16040 // elements. This exposes more load reduction opportunities by replacing
16041 // multiple small extract_vector_elements with a single 32-bit extract.
16042 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
16043 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16044 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
16045
16046 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16047 unsigned EltIdx = BitIndex / 32;
16048 unsigned LeftoverBitIdx = BitIndex % 32;
16049 SDLoc SL(N);
16050
// Bitcast to a vector of i32, extract the containing dword, then shift and
// truncate down to the requested sub-dword element.
16051 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
16052 DCI.AddToWorklist(Cast.getNode());
16053
16054 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
16055 DAG.getConstant(EltIdx, SL, MVT::i32));
16056 DCI.AddToWorklist(Elt.getNode());
16057 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
16058 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
16059 DCI.AddToWorklist(Srl.getNode());
16060
16061 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
16062 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
16063 DCI.AddToWorklist(Trunc.getNode());
16064
16065 if (VecEltVT == ResVT) {
16066 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
16067 }
16068
16069 assert(ResVT.isScalarInteger());
16070 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
16071 }
16072
16073 return SDValue();
16074}
16075
// DAG combine for INSERT_VECTOR_ELT with a variable index: expand into a
// build_vector of per-lane selects so no indirect (memory/movrel) lowering
// is needed.
16076 SDValue
16077 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16078 DAGCombinerInfo &DCI) const {
16079 SDValue Vec = N->getOperand(0);
16080 SDValue Idx = N->getOperand(2);
16081 EVT VecVT = Vec.getValueType();
16082 EVT EltVT = VecVT.getVectorElementType();
16083
16084 // INSERT_VECTOR_ELT (<n x e>, var-idx)
16085 // => BUILD_VECTOR n x select (e, const-idx)
// NOTE(review): extraction dropped line 16086 — the condition for this
// early-out, presumably a constant-index / shouldExpandVectorDynExt(N)
// check — confirm against upstream.
16087 return SDValue();
16088
16089 SelectionDAG &DAG = DCI.DAG;
16090 SDLoc SL(N);
16091 SDValue Ins = N->getOperand(1);
16092 EVT IdxVT = Idx.getValueType();
16093
// NOTE(review): extraction dropped line 16094 — the declaration of `Ops`,
// presumably `SmallVector<SDValue, 16> Ops;` — confirm against upstream.
// Each lane becomes: (Idx == I) ? Ins : original element I.
16095 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16096 SDValue IC = DAG.getConstant(I, SL, IdxVT);
16097 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
16098 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
16099 Ops.push_back(V);
16100 }
16101
16102 return DAG.getBuildVector(VecVT, SL, Ops);
16103}
16104
16105/// Return the source of an fp_extend from f16 to f32, or a converted FP
16106/// constant.
16108 if (Src.getOpcode() == ISD::FP_EXTEND &&
16109 Src.getOperand(0).getValueType() == MVT::f16) {
16110 return Src.getOperand(0);
16111 }
16112
16113 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
16114 APFloat Val = CFP->getValueAPF();
16115 bool LosesInfo = true;
16117 if (!LosesInfo)
16118 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
16119 }
16120
16121 return SDValue();
16122}
16123
16124SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
16125 DAGCombinerInfo &DCI) const {
16126 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16127 "combine only useful on gfx8");
16128
16129 SDValue TruncSrc = N->getOperand(0);
16130 EVT VT = N->getValueType(0);
16131 if (VT != MVT::f16)
16132 return SDValue();
16133
16134 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16135 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
16136 return SDValue();
16137
16138 SelectionDAG &DAG = DCI.DAG;
16139 SDLoc SL(N);
16140
16141 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
16142 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
16143 // casting back.
16144
16145 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
16146 // fmin(fmax(a, b), fmax(fmin(a, b), c))
16147 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
16148 if (!A)
16149 return SDValue();
16150
16151 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
16152 if (!B)
16153 return SDValue();
16154
16155 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
16156 if (!C)
16157 return SDValue();
16158
16159 // This changes signaling nan behavior. If an input is a signaling nan, it
16160 // would have been quieted by the fpext originally. We don't care because
16161 // these are unconstrained ops. If we needed to insert quieting canonicalizes
16162 // we would be worse off than just doing the promotion.
16163 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
16164 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
16165 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
16166 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
16167}
16168
// Choose the fused mul-add opcode (FMAD or FMA) usable for contracting the
// pair (N0, N1), or 0 if no fusion is allowed.
16169 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16170 const SDNode *N0,
16171 const SDNode *N1) const {
16172 EVT VT = N0->getValueType(0);
16173
16174 // Only do this if we are not trying to support denormals. v_mad_f32 does not
16175 // support denormals ever.
// NOTE(review): extraction dropped lines 16177 and 16179-16180 — the
// denormal-mode checks for f32/f16 and the closing `isOperationLegal(
// ISD::FMAD, VT)` portion of this condition — confirm against upstream.
16176 if (((VT == MVT::f32 &&
16178 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16181 return ISD::FMAD;
16182
// FMA is allowed under fast FP fusion, or when both nodes carry the
// contract fast-math flag.
// NOTE(review): extraction dropped line 16187 — the tail of this condition,
// presumably an isFMAFasterThanFMulAndFAdd(...) profitability check —
// confirm against upstream.
16183 const TargetOptions &Options = DAG.getTarget().Options;
16184 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
16185 (N0->getFlags().hasAllowContract() &&
16186 N1->getFlags().hasAllowContract())) &&
16188 return ISD::FMA;
16189 }
16190
16191 return 0;
16192}
16193
16194// For a reassociatable opcode perform:
16195// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16196SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16197 SelectionDAG &DAG) const {
16198 EVT VT = N->getValueType(0);
16199 if (VT != MVT::i32 && VT != MVT::i64)
16200 return SDValue();
16201
16202 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
16203 return SDValue();
16204
16205 unsigned Opc = N->getOpcode();
16206 SDValue Op0 = N->getOperand(0);
16207 SDValue Op1 = N->getOperand(1);
16208
16209 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16210 return SDValue();
16211
16212 if (Op0->isDivergent())
16213 std::swap(Op0, Op1);
16214
16215 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16216 return SDValue();
16217
16218 SDValue Op2 = Op1.getOperand(1);
16219 Op1 = Op1.getOperand(0);
16220 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16221 return SDValue();
16222
16223 if (Op1->isDivergent())
16224 std::swap(Op1, Op2);
16225
16226 SDLoc SL(N);
16227 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16228 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16229}
16230
16231static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16232 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16234 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
16235 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
16236 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
16237}
16238
16239// Fold
16240// y = lshr i64 x, 32
16241// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16242// with Const.hi == -1
16243// To
16244// res = mad_u64_u32 y.lo ,Const.lo, x.lo
16246 SDValue MulLHS, SDValue MulRHS,
16247 SDValue AddRHS) {
16248 if (MulRHS.getOpcode() == ISD::SRL)
16249 std::swap(MulLHS, MulRHS);
16250
16251 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16252 return SDValue();
16253
16254 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16255 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16256 MulLHS.getOperand(0) != AddRHS)
16257 return SDValue();
16258
16260 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16261 return SDValue();
16262
16263 SDValue ConstMul =
16264 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
16265 return getMad64_32(DAG, SL, MVT::i64,
16266 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16267 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16268}
16269
16270// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16271// multiplies, if any.
16272//
16273// Full 64-bit multiplies that feed into an addition are lowered here instead
16274// of using the generic expansion. The generic expansion ends up with
16275// a tree of ADD nodes that prevents us from using the "add" part of the
16276// MAD instruction. The expansion produced here results in a chain of ADDs
16277// instead of a tree.
16278SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16279 DAGCombinerInfo &DCI) const {
16280 assert(N->isAnyAdd());
16281
16282 SelectionDAG &DAG = DCI.DAG;
16283 EVT VT = N->getValueType(0);
16284 SDLoc SL(N);
16285 SDValue LHS = N->getOperand(0);
16286 SDValue RHS = N->getOperand(1);
16287
16288 if (VT.isVector())
16289 return SDValue();
16290
16291 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16292 // result in scalar registers for uniform values.
16293 if (!N->isDivergent() && Subtarget->hasSMulHi())
16294 return SDValue();
16295
16296 unsigned NumBits = VT.getScalarSizeInBits();
16297 if (NumBits <= 32 || NumBits > 64)
16298 return SDValue();
16299
16300 if (LHS.getOpcode() != ISD::MUL) {
16301 assert(RHS.getOpcode() == ISD::MUL);
16302 std::swap(LHS, RHS);
16303 }
16304
16305 // Avoid the fold if it would unduly increase the number of multiplies due to
16306 // multiple uses, except on hardware with full-rate multiply-add (which is
16307 // part of full-rate 64-bit ops).
16308 if (!Subtarget->hasFullRate64Ops()) {
16309 unsigned NumUsers = 0;
16310 for (SDNode *User : LHS->users()) {
16311 // There is a use that does not feed into addition, so the multiply can't
16312 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16313 if (!User->isAnyAdd())
16314 return SDValue();
16315
16316 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16317 // MUL + 3xADD + 3xADDC over 3xMAD.
16318 ++NumUsers;
16319 if (NumUsers >= 3)
16320 return SDValue();
16321 }
16322 }
16323
16324 SDValue MulLHS = LHS.getOperand(0);
16325 SDValue MulRHS = LHS.getOperand(1);
16326 SDValue AddRHS = RHS;
16327
16328 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16329 return FoldedMAD;
16330
16331 // Always check whether operands are small unsigned values, since that
16332 // knowledge is useful in more cases. Check for small signed values only if
16333 // doing so can unlock a shorter code sequence.
16334 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
16335 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
16336
16337 bool MulSignedLo = false;
16338 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16339 MulSignedLo =
16340 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
16341 }
16342
16343 // The operands and final result all have the same number of bits. If
16344 // operands need to be extended, they can be extended with garbage. The
16345 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16346 // truncated away in the end.
16347 if (VT != MVT::i64) {
16348 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
16349 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
16350 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
16351 }
16352
16353 // The basic code generated is conceptually straightforward. Pseudo code:
16354 //
16355 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16356 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16357 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16358 //
16359 // The second and third lines are optional, depending on whether the factors
16360 // are {sign,zero}-extended or not.
16361 //
16362 // The actual DAG is noisier than the pseudo code, but only due to
16363 // instructions that disassemble values into low and high parts, and
16364 // assemble the final result.
16365 SDValue One = DAG.getConstant(1, SL, MVT::i32);
16366
16367 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
16368 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
16369 SDValue Accum =
16370 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16371
16372 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16373 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16374
16375 if (!MulLHSUnsigned32) {
16376 auto MulLHSHi =
16377 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
16378 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
16379 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16380 }
16381
16382 if (!MulRHSUnsigned32) {
16383 auto MulRHSHi =
16384 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
16385 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
16386 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16387 }
16388
16389 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
16390 Accum = DAG.getBitcast(MVT::i64, Accum);
16391 }
16392
16393 if (VT != MVT::i64)
16394 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
16395 return Accum;
16396}
16397
16398SDValue
16399SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16400 DAGCombinerInfo &DCI) const {
16401 SDValue RHS = N->getOperand(1);
16402 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16403 if (!CRHS)
16404 return SDValue();
16405
16406 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16407 // common.
16408 uint64_t Val = CRHS->getZExtValue();
16409 if (countr_zero(Val) >= 32) {
16410 SelectionDAG &DAG = DCI.DAG;
16411 SDLoc SL(N);
16412 SDValue LHS = N->getOperand(0);
16413
16414 // Avoid carry machinery if we know the low half of the add does not
16415 // contribute to the final result.
16416 //
16417 // add i64:x, K if computeTrailingZeros(K) >= 32
16418 // => build_pair (add x.hi, K.hi), x.lo
16419
16420 // Breaking the 64-bit add here with this strange constant is unlikely
16421 // to interfere with addressing mode patterns.
16422
16423 SDValue Hi = getHiHalf64(LHS, DAG);
16424 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16425 unsigned Opcode = N->getOpcode();
16426 if (Opcode == ISD::PTRADD)
16427 Opcode = ISD::ADD;
16428 SDValue AddHi =
16429 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16430
16431 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16432 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16433 }
16434
16435 return SDValue();
16436}
16437
16438// Collect the ultimate src of each of the mul node's operands, and confirm
16439// each operand is 8 bytes.
16440static std::optional<ByteProvider<SDValue>>
16441handleMulOperand(const SDValue &MulOperand) {
16442 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16443 if (!Byte0 || Byte0->isConstantZero()) {
16444 return std::nullopt;
16445 }
16446 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16447 if (Byte1 && !Byte1->isConstantZero()) {
16448 return std::nullopt;
16449 }
16450 return Byte0;
16451}
16452
// Merge two v_perm byte-select masks. A byte value of 0x0c selects a
// constant zero; the masks are expected to be complementary, i.e. every
// byte lane is a zero lane in at least one of them.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  const unsigned CLanes = 0x0c0c0c0c;
  unsigned FirstZeros = First & CLanes;
  unsigned SecondZeros = Second & CLanes;
  unsigned FirstReal = First & ~CLanes;
  unsigned SecondReal = Second & ~CLanes;

  // Every byte lane must carry a zero-select in at least one mask.
  assert((FirstZeros & 0xFF) | (SecondZeros & 0xFF));
  assert((FirstZeros & 0xFF00) | (SecondZeros & 0xFF00));
  assert((FirstZeros & 0xFF0000) | (SecondZeros & 0xFF0000));
  assert((FirstZeros & 0xFF000000) | (SecondZeros & 0xFF000000));

  // Keep all real selectors; keep 0x0c only where both masks agree on zero.
  return (FirstReal | SecondReal) | (FirstZeros & SecondZeros);
}
16466
16467struct DotSrc {
16469 int64_t PermMask;
16471};
16472
16476 SmallVectorImpl<DotSrc> &Src1s, int Step) {
16477
16478 assert(Src0.Src.has_value() && Src1.Src.has_value());
16479 // Src0s and Src1s are empty, just place arbitrarily.
16480 if (Step == 0) {
16481 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16482 Src0.SrcOffset / 4});
16483 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16484 Src1.SrcOffset / 4});
16485 return;
16486 }
16487
16488 for (int BPI = 0; BPI < 2; BPI++) {
16489 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16490 if (BPI == 1) {
16491 BPP = {Src1, Src0};
16492 }
16493 unsigned ZeroMask = 0x0c0c0c0c;
16494 unsigned FMask = 0xFF << (8 * (3 - Step));
16495
16496 unsigned FirstMask =
16497 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16498 unsigned SecondMask =
16499 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16500 // Attempt to find Src vector which contains our SDValue, if so, add our
16501 // perm mask to the existing one. If we are unable to find a match for the
16502 // first SDValue, attempt to find match for the second.
16503 int FirstGroup = -1;
16504 for (int I = 0; I < 2; I++) {
16505 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16506 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16507 return IterElt.SrcOp == *BPP.first.Src &&
16508 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16509 };
16510
16511 auto *Match = llvm::find_if(Srcs, MatchesFirst);
16512 if (Match != Srcs.end()) {
16513 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16514 FirstGroup = I;
16515 break;
16516 }
16517 }
16518 if (FirstGroup != -1) {
16519 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16520 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16521 return IterElt.SrcOp == *BPP.second.Src &&
16522 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16523 };
16524 auto *Match = llvm::find_if(Srcs, MatchesSecond);
16525 if (Match != Srcs.end()) {
16526 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16527 } else
16528 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16529 return;
16530 }
16531 }
16532
16533 // If we have made it here, then we could not find a match in Src0s or Src1s
16534 // for either Src0 or Src1, so just place them arbitrarily.
16535
16536 unsigned ZeroMask = 0x0c0c0c0c;
16537 unsigned FMask = 0xFF << (8 * (3 - Step));
16538
16539 Src0s.push_back(
16540 {*Src0.Src,
16541 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16542 Src0.SrcOffset / 4});
16543 Src1s.push_back(
16544 {*Src1.Src,
16545 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16546 Src1.SrcOffset / 4});
16547}
16548
16550 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16551 bool IsAny) {
16552
16553 // If we just have one source, just permute it accordingly.
16554 if (Srcs.size() == 1) {
16555 auto *Elt = Srcs.begin();
16556 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
16557
16558 // v_perm will produce the original value
16559 if (Elt->PermMask == 0x3020100)
16560 return EltOp;
16561
16562 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16563 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
16564 }
16565
16566 auto *FirstElt = Srcs.begin();
16567 auto *SecondElt = std::next(FirstElt);
16568
16570
16571 // If we have multiple sources in the chain, combine them via perms (using
16572 // calculated perm mask) and Ors.
16573 while (true) {
16574 auto FirstMask = FirstElt->PermMask;
16575 auto SecondMask = SecondElt->PermMask;
16576
16577 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16578 unsigned FirstPlusFour = FirstMask | 0x04040404;
16579 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
16580 // original 0x0C.
16581 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16582
16583 auto PermMask = addPermMasks(FirstMask, SecondMask);
16584 auto FirstVal =
16585 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16586 auto SecondVal =
16587 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
16588
16589 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
16590 SecondVal,
16591 DAG.getConstant(PermMask, SL, MVT::i32)));
16592
16593 FirstElt = std::next(SecondElt);
16594 if (FirstElt == Srcs.end())
16595 break;
16596
16597 SecondElt = std::next(FirstElt);
16598 // If we only have a FirstElt, then just combine that into the cumulative
16599 // source node.
16600 if (SecondElt == Srcs.end()) {
16601 auto EltOp =
16602 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16603
16604 Perms.push_back(
16605 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16606 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16607 break;
16608 }
16609 }
16610
16611 assert(Perms.size() == 1 || Perms.size() == 2);
16612 return Perms.size() == 2
16613 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
16614 : Perms[0];
16615}
16616
16617static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16618 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16619 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16620 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16621 EntryMask += ZeroMask;
16622 }
16623}
16624
16625static bool isMul(const SDValue Op) {
16626 auto Opcode = Op.getOpcode();
16627
16628 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16629 Opcode == AMDGPUISD::MUL_I24);
16630}
16631
16632static std::optional<bool>
16634 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16635 const SDValue &S1Op, const SelectionDAG &DAG) {
16636 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
16637 // of the dot4 is irrelevant.
16638 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16639 return false;
16640
16641 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16642 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16643 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16644 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16645 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16646 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16647
16648 assert(!(S0IsUnsigned && S0IsSigned));
16649 assert(!(S1IsUnsigned && S1IsSigned));
16650
16651 // There are 9 possible permutations of
16652 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16653
16654 // In two permutations, the sign bits are known to be the same for both Ops,
16655 // so simply return Signed / Unsigned corresponding to the MSB
16656
16657 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16658 return S0IsSigned;
16659
16660 // In another two permutations, the sign bits are known to be opposite. In
16661 // this case return std::nullopt to indicate a bad match.
16662
16663 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16664 return std::nullopt;
16665
16666 // In the remaining five permutations, we don't know the value of the sign
16667 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16668 // the upper bits must be extension bits. Thus, the only ways for the sign
16669 // bit to be unknown is if it was sign extended from unknown value, or if it
16670 // was any extended. In either case, it is correct to use the signed
16671 // version of the signedness semantics of dot4
16672
16673 // In two of such permutations, we known the sign bit is set for
16674 // one op, and the other is unknown. It is okay to used signed version of
16675 // dot4.
16676 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16677 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16678 return true;
16679
16680 // In one such permutation, we don't know either of the sign bits. It is okay
16681 // to used the signed version of dot4.
16682 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16683 return true;
16684
16685 // In two of such permutations, we known the sign bit is unset for
16686 // one op, and the other is unknown. Return std::nullopt to indicate a
16687 // bad match.
16688 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16689 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16690 return std::nullopt;
16691
16692 llvm_unreachable("Fully covered condition");
16693}
16694
16695SDValue SITargetLowering::performAddCombine(SDNode *N,
16696 DAGCombinerInfo &DCI) const {
16697 SelectionDAG &DAG = DCI.DAG;
16698 EVT VT = N->getValueType(0);
16699 SDLoc SL(N);
16700 SDValue LHS = N->getOperand(0);
16701 SDValue RHS = N->getOperand(1);
16702
16703 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16704 if (Subtarget->hasMad64_32()) {
16705 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16706 return Folded;
16707 }
16708 }
16709
16710 if (SDValue V = reassociateScalarOps(N, DAG)) {
16711 return V;
16712 }
16713
16714 if (VT == MVT::i64) {
16715 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16716 return Folded;
16717 }
16718
16719 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16720 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16721 SDValue TempNode(N, 0);
16722 std::optional<bool> IsSigned;
16726
16727 // Match the v_dot4 tree, while collecting src nodes.
16728 int ChainLength = 0;
16729 for (int I = 0; I < 4; I++) {
16730 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16731 if (MulIdx == -1)
16732 break;
16733 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16734 if (!Src0)
16735 break;
16736 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16737 if (!Src1)
16738 break;
16739
16740 auto IterIsSigned = checkDot4MulSignedness(
16741 TempNode->getOperand(MulIdx), *Src0, *Src1,
16742 TempNode->getOperand(MulIdx)->getOperand(0),
16743 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16744 if (!IterIsSigned)
16745 break;
16746 if (!IsSigned)
16747 IsSigned = *IterIsSigned;
16748 if (*IterIsSigned != *IsSigned)
16749 break;
16750 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16751 auto AddIdx = 1 - MulIdx;
16752 // Allow the special case where add (add (mul24, 0), mul24) became ->
16753 // add (mul24, mul24).
16754 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16755 Src2s.push_back(TempNode->getOperand(AddIdx));
16756 auto Src0 =
16757 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16758 if (!Src0)
16759 break;
16760 auto Src1 =
16761 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16762 if (!Src1)
16763 break;
16764 auto IterIsSigned = checkDot4MulSignedness(
16765 TempNode->getOperand(AddIdx), *Src0, *Src1,
16766 TempNode->getOperand(AddIdx)->getOperand(0),
16767 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16768 if (!IterIsSigned)
16769 break;
16770 assert(IsSigned);
16771 if (*IterIsSigned != *IsSigned)
16772 break;
16773 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16774 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16775 ChainLength = I + 2;
16776 break;
16777 }
16778
16779 TempNode = TempNode->getOperand(AddIdx);
16780 Src2s.push_back(TempNode);
16781 ChainLength = I + 1;
16782 if (TempNode->getNumOperands() < 2)
16783 break;
16784 LHS = TempNode->getOperand(0);
16785 RHS = TempNode->getOperand(1);
16786 }
16787
16788 if (ChainLength < 2)
16789 return SDValue();
16790
16791 // Masks were constructed with assumption that we would find a chain of
16792 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
16793 // 0x0c) so they do not affect dot calculation.
16794 if (ChainLength < 4) {
16795 fixMasks(Src0s, ChainLength);
16796 fixMasks(Src1s, ChainLength);
16797 }
16798
16799 SDValue Src0, Src1;
16800
16801 // If we are just using a single source for both, and have permuted the
16802 // bytes consistently, we can just use the sources without permuting
16803 // (commutation).
16804 bool UseOriginalSrc = false;
16805 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16806 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16807 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16808 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16809 SmallVector<unsigned, 4> SrcBytes;
16810 auto Src0Mask = Src0s.begin()->PermMask;
16811 SrcBytes.push_back(Src0Mask & 0xFF000000);
16812 bool UniqueEntries = true;
16813 for (auto I = 1; I < 4; I++) {
16814 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16815
16816 if (is_contained(SrcBytes, NextByte)) {
16817 UniqueEntries = false;
16818 break;
16819 }
16820 SrcBytes.push_back(NextByte);
16821 }
16822
16823 if (UniqueEntries) {
16824 UseOriginalSrc = true;
16825
16826 auto *FirstElt = Src0s.begin();
16827 auto FirstEltOp =
16828 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16829
16830 auto *SecondElt = Src1s.begin();
16831 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16832 SecondElt->DWordOffset);
16833
16834 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16835 MVT::getIntegerVT(32));
16836 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16837 MVT::getIntegerVT(32));
16838 }
16839 }
16840
16841 if (!UseOriginalSrc) {
16842 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16843 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16844 }
16845
16846 assert(IsSigned);
16847 SDValue Src2 =
16848 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16849
16850 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16851 : Intrinsic::amdgcn_udot4,
16852 SL, MVT::i64);
16853
16854 assert(!VT.isVector());
16855 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16856 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16857
16858 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16859 }
16860
16861 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16862 return SDValue();
16863
16864 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16865 // add x, sext (setcc) => usubo_carry x, 0, setcc
16866 unsigned Opc = LHS.getOpcode();
16869 std::swap(RHS, LHS);
16870
16871 Opc = RHS.getOpcode();
16872 switch (Opc) {
16873 default:
16874 break;
16875 case ISD::ZERO_EXTEND:
16876 case ISD::SIGN_EXTEND:
16877 case ISD::ANY_EXTEND: {
16878 auto Cond = RHS.getOperand(0);
16879 // If this won't be a real VOPC output, we would still need to insert an
16880 // extra instruction anyway.
16881 if (!isBoolSGPR(Cond))
16882 break;
16883 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16884 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16886 return DAG.getNode(Opc, SL, VTList, Args);
16887 }
16888 case ISD::UADDO_CARRY: {
16889 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16890 if (!isNullConstant(RHS.getOperand(1)))
16891 break;
16892 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16893 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16894 }
16895 }
16896 return SDValue();
16897}
16898
16899SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16900 DAGCombinerInfo &DCI) const {
16901 SelectionDAG &DAG = DCI.DAG;
16902 SDLoc DL(N);
16903 EVT VT = N->getValueType(0);
16904 SDValue N0 = N->getOperand(0);
16905 SDValue N1 = N->getOperand(1);
16906
16907 // The following folds transform PTRADDs into regular arithmetic in cases
16908 // where the PTRADD wouldn't be folded as an immediate offset into memory
16909 // instructions anyway. They are target-specific in that other targets might
16910 // prefer to not lose information about the pointer arithmetic.
16911
16912 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16913 // Adapted from DAGCombiner::visitADDLikeCommutative.
16914 SDValue V, K;
16915 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16916 SDNodeFlags ShlFlags = N1->getFlags();
16917 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16918 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16919 // preserved.
16920 SDNodeFlags NewShlFlags =
16921 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16923 : SDNodeFlags();
16924 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16925 DCI.AddToWorklist(Inner.getNode());
16926 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16927 }
16928
16929 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16930 // performAddCombine.
16931 if (N1.getOpcode() == ISD::MUL) {
16932 if (Subtarget->hasMad64_32()) {
16933 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16934 return Folded;
16935 }
16936 }
16937
16938 // If the 32 low bits of the constant are all zero, there is nothing to fold
16939 // into an immediate offset, so it's better to eliminate the unnecessary
16940 // addition for the lower 32 bits than to preserve the PTRADD.
16941 // Analogous to a fold in performAddCombine.
16942 if (VT == MVT::i64) {
16943 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16944 return Folded;
16945 }
16946
16947 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16948 return SDValue();
16949
16950 SDValue X = N0;
16951 SDValue Y = N1.getOperand(0);
16952 SDValue Z = N1.getOperand(1);
16953 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16954 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16955
16956 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16957 Y->isDivergent() != Z->isDivergent()) {
16958 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16959 // y are uniform and z isn't.
16960 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16961 // z are uniform and y isn't.
16962 // The goal is to push uniform operands up in the computation, so that they
16963 // can be handled with scalar operations. We can't use reassociateScalarOps
16964 // for this since it requires two identical commutative operations to
16965 // reassociate.
16966 if (Y->isDivergent())
16967 std::swap(Y, Z);
16968 // If both additions in the original were NUW, reassociation preserves that.
16969 SDNodeFlags ReassocFlags =
16970 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16971 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16972 DCI.AddToWorklist(UniformInner.getNode());
16973 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16974 }
16975
16976 return SDValue();
16977}
16978
16979SDValue SITargetLowering::performSubCombine(SDNode *N,
16980 DAGCombinerInfo &DCI) const {
16981 SelectionDAG &DAG = DCI.DAG;
16982 EVT VT = N->getValueType(0);
16983
16984 if (VT == MVT::i64) {
16985 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16986 return Folded;
16987 }
16988
16989 if (VT != MVT::i32)
16990 return SDValue();
16991
16992 SDLoc SL(N);
16993 SDValue LHS = N->getOperand(0);
16994 SDValue RHS = N->getOperand(1);
16995
16996 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16997 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16998 unsigned Opc = RHS.getOpcode();
16999 switch (Opc) {
17000 default:
17001 break;
17002 case ISD::ZERO_EXTEND:
17003 case ISD::SIGN_EXTEND:
17004 case ISD::ANY_EXTEND: {
17005 auto Cond = RHS.getOperand(0);
17006 // If this won't be a real VOPC output, we would still need to insert an
17007 // extra instruction anyway.
17008 if (!isBoolSGPR(Cond))
17009 break;
17010 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17011 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17013 return DAG.getNode(Opc, SL, VTList, Args);
17014 }
17015 }
17016
17017 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
17018 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
17019 if (!isNullConstant(LHS.getOperand(1)))
17020 return SDValue();
17021 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
17022 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
17023 }
17024 return SDValue();
17025}
17026
17027SDValue
17028SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
17029 DAGCombinerInfo &DCI) const {
17030
17031 if (N->getValueType(0) != MVT::i32)
17032 return SDValue();
17033
17034 if (!isNullConstant(N->getOperand(1)))
17035 return SDValue();
17036
17037 SelectionDAG &DAG = DCI.DAG;
17038 SDValue LHS = N->getOperand(0);
17039
17040 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
17041 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
17042 unsigned LHSOpc = LHS.getOpcode();
17043 unsigned Opc = N->getOpcode();
17044 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
17045 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
17046 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
17047 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
17048 }
17049 return SDValue();
17050}
17051
17052SDValue SITargetLowering::performFAddCombine(SDNode *N,
17053 DAGCombinerInfo &DCI) const {
17054 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17055 return SDValue();
17056
17057 SelectionDAG &DAG = DCI.DAG;
17058 EVT VT = N->getValueType(0);
17059
17060 SDLoc SL(N);
17061 SDValue LHS = N->getOperand(0);
17062 SDValue RHS = N->getOperand(1);
17063
17064 // These should really be instruction patterns, but writing patterns with
17065 // source modifiers is a pain.
17066
17067 // fadd (fadd (a, a), b) -> mad 2.0, a, b
17068 if (LHS.getOpcode() == ISD::FADD) {
17069 SDValue A = LHS.getOperand(0);
17070 if (A == LHS.getOperand(1)) {
17071 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17072 if (FusedOp != 0) {
17073 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17074 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17075 }
17076 }
17077 }
17078
17079 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
17080 if (RHS.getOpcode() == ISD::FADD) {
17081 SDValue A = RHS.getOperand(0);
17082 if (A == RHS.getOperand(1)) {
17083 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17084 if (FusedOp != 0) {
17085 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17086 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17087 }
17088 }
17089 }
17090
17091 return SDValue();
17092}
17093
17094SDValue SITargetLowering::performFSubCombine(SDNode *N,
17095 DAGCombinerInfo &DCI) const {
17096 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17097 return SDValue();
17098
17099 SelectionDAG &DAG = DCI.DAG;
17100 SDLoc SL(N);
17101 EVT VT = N->getValueType(0);
17102 assert(!VT.isVector());
17103
17104 // Try to get the fneg to fold into the source modifier. This undoes generic
17105 // DAG combines and folds them into the mad.
17106 //
17107 // Only do this if we are not trying to support denormals. v_mad_f32 does
17108 // not support denormals ever.
17109 SDValue LHS = N->getOperand(0);
17110 SDValue RHS = N->getOperand(1);
17111 if (LHS.getOpcode() == ISD::FADD) {
17112 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
17113 SDValue A = LHS.getOperand(0);
17114 if (A == LHS.getOperand(1)) {
17115 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17116 if (FusedOp != 0) {
17117 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17118 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
17119
17120 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17121 }
17122 }
17123 }
17124
17125 if (RHS.getOpcode() == ISD::FADD) {
17126 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
17127
17128 SDValue A = RHS.getOperand(0);
17129 if (A == RHS.getOperand(1)) {
17130 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17131 if (FusedOp != 0) {
17132 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
17133 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17134 }
17135 }
17136 }
17137
17138 return SDValue();
17139}
17140
17141SDValue SITargetLowering::performFDivCombine(SDNode *N,
17142 DAGCombinerInfo &DCI) const {
17143 SelectionDAG &DAG = DCI.DAG;
17144 SDLoc SL(N);
17145 EVT VT = N->getValueType(0);
17146
17147 // fsqrt legality correlates to rsq availability.
17148 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
17149 return SDValue();
17150
17151 SDValue LHS = N->getOperand(0);
17152 SDValue RHS = N->getOperand(1);
17153
17154 SDNodeFlags Flags = N->getFlags();
17155 SDNodeFlags RHSFlags = RHS->getFlags();
17156 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
17157 !RHS->hasOneUse())
17158 return SDValue();
17159
17160 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
17161 bool IsNegative = false;
17162 if (CLHS->isExactlyValue(1.0) ||
17163 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17164 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
17165 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
17166 if (RHS.getOpcode() == ISD::FSQRT) {
17167 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
17168 SDValue Rsq =
17169 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
17170 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
17171 }
17172 }
17173 }
17174
17175 return SDValue();
17176}
17177
// Combine (fmul x, (select y, A, B)) where A and B are powers of two into
// (fldexp x, (select i32 y, a, b)): the integer exponents can be realized as
// cheap i32 inline constants instead of materializing FP immediates.
// Returns the replacement node, or an empty SDValue if no fold applies.
17178 SDValue SITargetLowering::performFMulCombine(SDNode *N,
17179 DAGCombinerInfo &DCI) const {
17180 SelectionDAG &DAG = DCI.DAG;
17181 EVT VT = N->getValueType(0);
17182 EVT ScalarVT = VT.getScalarType();
17183 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
17184 
// On subtargets with scalar FP instructions, a uniform f16/f32 multiply is
// better left alone so it can select to a SALU multiply.
17185 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17186 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16) {
17187 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
17188 return SDValue();
17189 }
17190 
17191 SDValue LHS = N->getOperand(0);
17192 SDValue RHS = N->getOperand(1);
17193 
17194 // It is cheaper to realize i32 inline constants as compared against
17195 // materializing f16 or f64 (or even non-inline f32) values,
17196 // possible via ldexp usage, as shown below :
17197 //
17198 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17199 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17200 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
17201 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17202 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17203 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
17204 if (!TrueNode)
17205 return SDValue();
17206 const ConstantFPSDNode *FalseNode =
17207 isConstOrConstSplatFP(RHS.getOperand(2));
17208 if (!FalseNode)
17209 return SDValue();
17210 
// Mixed signs cannot be folded into a single fneg of x.
17211 if (TrueNode->isNegative() != FalseNode->isNegative())
17212 return SDValue();
17213 
17214 // For f32, only non-inline constants should be transformed.
17215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17216 if (ScalarVT == MVT::f32 &&
17217 TII->isInlineConstant(TrueNode->getValueAPF()) &&
17218 TII->isInlineConstant(FalseNode->getValueAPF()))
17219 return SDValue();
17220 
// getExactLog2Abs returns INT_MIN when |constant| is not an exact power of
// two, in which case the ldexp rewrite is not possible.
17221 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17222 if (TrueNodeExpVal == INT_MIN)
17223 return SDValue();
17224 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17225 if (FalseNodeExpVal == INT_MIN)
17226 return SDValue();
17227 
17228 SDLoc SL(N);
17229 SDValue SelectNode =
17230 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17231 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17232 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17233 
// Both constants share the same sign (checked above), so a single fneg of
// the other multiplicand absorbs it.
17234 LHS = TrueNode->isNegative()
17235 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17236 : LHS;
17237 
17238 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17239 }
17240 
17241 return SDValue();
17242 }
17243
// Combine a chain of two f16->f32 extended multiply-adds that together read
// both lanes of a pair of v2f16 vectors into a single AMDGPUISD::FDOT2 node.
// Requires Subtarget->hasDot10Insts() and an f32 result.
17244 SDValue SITargetLowering::performFMACombine(SDNode *N,
17245 DAGCombinerInfo &DCI) const {
17246 SelectionDAG &DAG = DCI.DAG;
17247 EVT VT = N->getValueType(0);
17248 SDLoc SL(N);
17249 
17250 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17251 return SDValue();
17252 
17253 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17254 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17255 SDValue Op1 = N->getOperand(0);
17256 SDValue Op2 = N->getOperand(1);
17257 SDValue FMA = N->getOperand(2);
17258 
17259 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
17260 Op2.getOpcode() != ISD::FP_EXTEND)
17261 return SDValue();
17262 
17263 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17264 // regardless of the denorm mode setting. Therefore,
17265 // fp-contract is sufficient to allow generating fdot2.
17266 const TargetOptions &Options = DAG.getTarget().Options;
17267 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17268 (N->getFlags().hasAllowContract() &&
17269 FMA->getFlags().hasAllowContract())) {
17270 Op1 = Op1.getOperand(0);
17271 Op2 = Op2.getOperand(0);
17272 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
// NOTE(review): a source line appears to be missing here (presumably the
// matching Op2 EXTRACT_VECTOR_ELT check) — verify against upstream.
17274 return SDValue();
17275 
17276 SDValue Vec1 = Op1.getOperand(0);
17277 SDValue Idx1 = Op1.getOperand(1);
17278 SDValue Vec2 = Op2.getOperand(0);
17279 
17280 SDValue FMAOp1 = FMA.getOperand(0);
17281 SDValue FMAOp2 = FMA.getOperand(1);
17282 SDValue FMAAcc = FMA.getOperand(2);
17283 
17284 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
17285 FMAOp2.getOpcode() != ISD::FP_EXTEND)
17286 return SDValue();
17287 
17288 FMAOp1 = FMAOp1.getOperand(0);
17289 FMAOp2 = FMAOp2.getOperand(0);
17290 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
// NOTE(review): a source line appears to be missing here (presumably the
// matching FMAOp2 EXTRACT_VECTOR_ELT check) — verify against upstream.
17292 return SDValue();
17293 
17294 SDValue Vec3 = FMAOp1.getOperand(0);
17295 SDValue Vec4 = FMAOp2.getOperand(0);
17296 SDValue Idx2 = FMAOp1.getOperand(1);
17297 
// Both multiplies must pair matching element indices, and the two multiplies
// must read different lanes so together they cover the whole v2f16.
17298 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
17299 // Idx1 and Idx2 cannot be the same.
17300 Idx1 == Idx2)
17301 return SDValue();
17302 
17303 if (Vec1 == Vec2 || Vec3 == Vec4)
17304 return SDValue();
17305 
17306 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
17307 return SDValue();
17308 
// The two multiplies may name the source vectors in either order.
17309 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17310 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17311 DAG.getTargetConstant(0, SL, MVT::i1));
17312 }
17313 }
17314 return SDValue();
17315 }
17316
// Target combine for ISD::SETCC. In order, this: folds compares of
// sign-extended i1 values and of select-of-constants against a constant;
// narrows 64-bit eq/ne/ordered compares to their high 32-bit halves when the
// low halves are known; rewrites i64 add/sub overflow-style compares to use
// the carry-out of a 32-bit expansion; and matches the isinf/isfinite idiom
// into AMDGPUISD::FP_CLASS. Returns an empty SDValue when nothing applies.
17317 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17318 DAGCombinerInfo &DCI) const {
17319 SelectionDAG &DAG = DCI.DAG;
17320 SDLoc SL(N);
17321 
17322 SDValue LHS = N->getOperand(0);
17323 SDValue RHS = N->getOperand(1);
17324 EVT VT = LHS.getValueType();
17325 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17326 
// Canonicalize a constant onto the RHS so the folds below only need to
// look at one side.
17327 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
17328 if (!CRHS) {
// NOTE(review): a source line is missing here (presumably re-trying the
// dyn_cast on LHS into CRHS) — verify against upstream.
17330 if (CRHS) {
17331 std::swap(LHS, RHS);
17332 CC = getSetCCSwappedOperands(CC);
17333 }
17334 }
17335 
17336 if (CRHS) {
17337 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17338 isBoolSGPR(LHS.getOperand(0))) {
17339 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
17340 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
17341 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
17342 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
17343 if ((CRHS->isAllOnes() &&
17344 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
17345 (CRHS->isZero() &&
17346 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
17347 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
17348 DAG.getAllOnesConstant(SL, MVT::i1))
17349 if ((CRHS->isAllOnes() &&
17350 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
17351 (CRHS->isZero() &&
17352 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
17353 return LHS.getOperand(0);
17354 }
17355 
17356 const APInt &CRHSVal = CRHS->getAPIntValue();
17357 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
17358 LHS.getOpcode() == ISD::SELECT &&
17359 isa<ConstantSDNode>(LHS.getOperand(1)) &&
17360 isa<ConstantSDNode>(LHS.getOperand(2)) &&
17361 isBoolSGPR(LHS.getOperand(0))) {
17362 // Given CT != FT:
17363 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
17364 // setcc (select cc, CT, CF), CF, ne => cc
17365 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
17366 // setcc (select cc, CT, CF), CT, eq => cc
17367 const APInt &CT = LHS.getConstantOperandAPInt(1);
17368 const APInt &CF = LHS.getConstantOperandAPInt(2);
17369 
17370 if (CT != CF) {
17371 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
17372 (CT == CRHSVal && CC == ISD::SETNE))
17373 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17374 if ((CF == CRHSVal && CC == ISD::SETNE) ||
17375 (CT == CRHSVal && CC == ISD::SETEQ))
17376 return LHS.getOperand(0);
17377 }
17378 }
17379 }
17380 
17381 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
17382 // following cases where information about the lower 32-bits of its operands
17383 // is known:
17384 //
17385 // If LHS.lo32 == RHS.lo32:
17386 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17387 // If LHS.lo32 != RHS.lo32:
17388 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17389 // If LHS.lo32 >= RHS.lo32 (unsigned):
17390 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17391 // If LHS.lo32 > RHS.lo32 (unsigned):
17392 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17393 // If LHS.lo32 <= RHS.lo32 (unsigned):
17394 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17395 // If LHS.lo32 < RHS.lo32 (unsigned):
17396 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17397 if (VT == MVT::i64) {
17398 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
17399 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
17400 
17401 // NewCC is valid iff we can truncate the setcc to only test the upper 32
17402 // bits
// NOTE(review): the declaration of NewCC (initialized to ISD::SETCC_INVALID,
// given the check below) is missing from this dump — verify upstream.
17404 
17405 switch (CC) {
17406 default:
17407 break;
17408 case ISD::SETEQ: {
17409 const std::optional<bool> KnownEq =
17410 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17411 if (KnownEq)
17412 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
17413 
17414 break;
17415 }
17416 case ISD::SETNE: {
17417 const std::optional<bool> KnownEq =
17418 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17419 if (KnownEq)
17420 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
17421 
17422 break;
17423 }
17424 case ISD::SETULT:
17425 case ISD::SETUGE:
17426 case ISD::SETLT:
17427 case ISD::SETGE: {
17428 const std::optional<bool> KnownUge =
17429 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17430 if (KnownUge) {
17431 if (*KnownUge) {
17432 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17433 NewCC = CC;
17434 } else {
17435 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17436 NewCC = CC == ISD::SETULT ? ISD::SETULE
17437 : CC == ISD::SETUGE ? ISD::SETUGT
17438 : CC == ISD::SETLT ? ISD::SETLE
17439 : ISD::SETGT;
17440 }
17441 }
17442 break;
17443 }
17444 case ISD::SETULE:
17445 case ISD::SETUGT:
17446 case ISD::SETLE:
17447 case ISD::SETGT: {
17448 const std::optional<bool> KnownUle =
17449 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17450 if (KnownUle) {
17451 if (*KnownUle) {
17452 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17453 NewCC = CC;
17454 } else {
17455 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17456 NewCC = CC == ISD::SETULE ? ISD::SETULT
17457 : CC == ISD::SETUGT ? ISD::SETUGE
17458 : CC == ISD::SETLE ? ISD::SETLT
17459 : ISD::SETGE;
17460 }
17461 }
17462 break;
17463 }
17464 }
17465 
17466 if (NewCC != ISD::SETCC_INVALID)
17467 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
17468 getHiHalf64(RHS, DAG), NewCC);
17469 }
17470 
17471 // Eliminate setcc by using carryout from add/sub instruction
17472 
17473 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
17474 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
17475 // similarly for subtraction
17476 
17477 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
17478 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
17479 
// NOTE(review): the pattern-match conditions on lines 17481 and 17483 (the
// ult-of-ADD and ugt-of-SUB sd_match clauses) are missing from this dump.
17480 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
17482 (CC == ISD::SETUGT &&
17484 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
17485 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
17486 bool IsAdd = LHS.getOpcode() == ISD::ADD;
17487 
17488 SDValue Op0 = LHS.getOperand(0);
17489 SDValue Op1 = LHS.getOperand(1);
17490 
// Split the 64-bit operation into a 32-bit lo half with carry-out feeding a
// 32-bit hi half; the final carry-out is exactly the setcc result.
17491 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
17492 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
17493 
17494 SDValue Op0Hi = getHiHalf64(Op0, DAG);
17495 SDValue Op1Hi = getHiHalf64(Op1, DAG);
17496 
17497 SDValue NodeLo =
17498 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
17499 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17500 
17501 SDValue CarryInHi = NodeLo.getValue(1);
17502 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
17503 SL, DAG.getVTList(MVT::i32, MVT::i1),
17504 {Op0Hi, Op1Hi, CarryInHi});
17505 
17506 SDValue ResultLo = NodeLo.getValue(0);
17507 SDValue ResultHi = NodeHi.getValue(0);
17508 
17509 SDValue JoinedResult =
17510 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
17511 
// Replace the original 64-bit add/sub with the rejoined halves, then hand
// back the carry-out as the compare's value.
17512 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
17513 SDValue Overflow = NodeHi.getValue(1);
17514 DCI.CombineTo(LHS.getNode(), Result);
17515 return Overflow;
17516 }
17517 
17518 if (VT != MVT::f32 && VT != MVT::f64 &&
17519 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17520 return SDValue();
17521 
17522 // Match isinf/isfinite pattern
17523 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
17524 // (fcmp one (fabs x), inf) -> (fp_class x,
17525 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
17526 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
17527 LHS.getOpcode() == ISD::FABS) {
17528 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
17529 if (!CRHS)
17530 return SDValue();
17531 
17532 const APFloat &APF = CRHS->getValueAPF();
17533 if (APF.isInfinity() && !APF.isNegative()) {
// NOTE(review): the SIInstrFlags class-mask initializers (lines 17535 and
// 17537-17539) are missing from this dump — verify against upstream.
17534 const unsigned IsInfMask =
17536 const unsigned IsFiniteMask =
17540 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
17541 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
17542 DAG.getConstant(Mask, SL, MVT::i32));
17543 }
17544 }
17545 
17546 return SDValue();
17547 }
17548
17549SDValue
17550SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
17551 DAGCombinerInfo &DCI) const {
17552 SelectionDAG &DAG = DCI.DAG;
17553 SDLoc SL(N);
17554 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17555
17556 SDValue Src = N->getOperand(0);
17557 SDValue Shift = N->getOperand(0);
17558
17559 // TODO: Extend type shouldn't matter (assuming legal types).
17560 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
17561 Shift = Shift.getOperand(0);
17562
17563 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
17564 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
17565 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
17566 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
17567 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
17568 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
17569 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
17570 SDValue Shifted = DAG.getZExtOrTrunc(
17571 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
17572
17573 unsigned ShiftOffset = 8 * Offset;
17574 if (Shift.getOpcode() == ISD::SHL)
17575 ShiftOffset -= C->getZExtValue();
17576 else
17577 ShiftOffset += C->getZExtValue();
17578
17579 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17580 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17581 MVT::f32, Shifted);
17582 }
17583 }
17584 }
17585
17586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17587 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
17588 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
17589 // We simplified Src. If this node is not dead, visit it again so it is
17590 // folded properly.
17591 if (N->getOpcode() != ISD::DELETED_NODE)
17592 DCI.AddToWorklist(N);
17593 return SDValue(N, 0);
17594 }
17595
17596 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
17597 if (SDValue DemandedSrc =
17598 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
17599 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
17600
17601 return SDValue();
17602}
17603
17604SDValue SITargetLowering::performClampCombine(SDNode *N,
17605 DAGCombinerInfo &DCI) const {
17606 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
17607 if (!CSrc)
17608 return SDValue();
17609
17610 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17611 const APFloat &F = CSrc->getValueAPF();
17612 APFloat Zero = APFloat::getZero(F.getSemantics());
17613 if (F < Zero ||
17614 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17615 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
17616 }
17617
17618 APFloat One(F.getSemantics(), "1.0");
17619 if (F > One)
17620 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
17621
17622 return SDValue(CSrc, 0);
17623}
17624
// Fold a select whose condition compares against the same constant the
// select produces, so the constant only needs to be materialized once:
//   select (setcc x, C, eq), C, y  -> select (setcc x, C, eq), x, y
//   select (setcc x, C, ne), y, C  -> select (setcc x, C, ne), y, x
// Constants cheap enough to be inline immediates are left alone.
17625 SDValue SITargetLowering::performSelectCombine(SDNode *N,
17626 DAGCombinerInfo &DCI) const {
17627 
17628 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17629 // integer).
17630 // Detect when CMP and SELECT use the same constant and fold them to avoid
17631 // loading the constant twice. Specifically handles patterns like:
17632 // %cmp = icmp eq i32 %val, 4242
17633 // %sel = select i1 %cmp, i32 4242, i32 %other
17634 // It can be optimized to reuse %val instead of 4242 in select.
17635 SDValue Cond = N->getOperand(0);
17636 SDValue TrueVal = N->getOperand(1);
17637 SDValue FalseVal = N->getOperand(2);
17638 
17639 // Check if condition is a comparison.
17640 if (Cond.getOpcode() != ISD::SETCC)
17641 return SDValue();
17642 
17643 SDValue LHS = Cond.getOperand(0);
17644 SDValue RHS = Cond.getOperand(1);
17645 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17646 
17647 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17648 bool isInteger = LHS.getValueType().isInteger();
17649 
17650 // Handle simple floating-point and integer types only.
17651 if (!isFloatingPoint && !isInteger)
17652 return SDValue();
17653 
// Only exact (non-)equality predicates are safe for this substitution;
// ordered variants are used for FP.
17654 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17655 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17656 if (!isEquality && !isNonEquality)
17657 return SDValue();
17658 
// Identify which compare operand is the constant; the other is the value
// we can substitute into the select.
17659 SDValue ArgVal, ConstVal;
17660 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17661 (isInteger && isa<ConstantSDNode>(RHS))) {
17662 ConstVal = RHS;
17663 ArgVal = LHS;
17664 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17665 (isInteger && isa<ConstantSDNode>(LHS))) {
17666 ConstVal = LHS;
17667 ArgVal = RHS;
17668 } else {
17669 return SDValue();
17670 }
17671 
17672 // Skip optimization for inlinable immediates.
17673 if (isFloatingPoint) {
17674 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17675 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17676 return SDValue();
17677 } else {
// NOTE(review): a source line is missing here (presumably the integer
// inline-immediate check whose argument continues below) — verify upstream.
17679 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17680 return SDValue();
17681 }
17682 
17683 // For equality and non-equality comparisons, patterns:
17684 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17685 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17686 if (!(isEquality && TrueVal == ConstVal) &&
17687 !(isNonEquality && FalseVal == ConstVal))
17688 return SDValue();
17689 
17690 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17691 SDValue SelectRHS =
17692 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17693 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17694 SelectLHS, SelectRHS);
17695 }
17696
// Top-level target DAG-combine dispatcher: first tries uniform-op promotion
// to i32 for common ALU opcodes, then (at -O1 and above) routes each opcode
// to its dedicated perform*Combine helper.
// NOTE(review): the signature line (SITargetLowering::PerformDAGCombine) is
// missing from this dump — verify against upstream.
17698 DAGCombinerInfo &DCI) const {
// Uniform 16-bit ops are often cheaper as 32-bit scalar ops; try that first
// regardless of optimization level.
17699 switch (N->getOpcode()) {
17700 case ISD::ADD:
17701 case ISD::SUB:
17702 case ISD::SHL:
17703 case ISD::SRL:
17704 case ISD::SRA:
17705 case ISD::AND:
17706 case ISD::OR:
17707 case ISD::XOR:
17708 case ISD::MUL:
17709 case ISD::SETCC:
17710 case ISD::SELECT:
17711 case ISD::SMIN:
17712 case ISD::SMAX:
17713 case ISD::UMIN:
17714 case ISD::UMAX:
17715 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17716 return Res;
17717 break;
17718 default:
17719 break;
17720 }
17721 
// All remaining combines are pure optimizations; skip them at -O0.
17722 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17723 return SDValue();
17724 
17725 switch (N->getOpcode()) {
17726 case ISD::ADD:
17727 return performAddCombine(N, DCI);
17728 case ISD::PTRADD:
17729 return performPtrAddCombine(N, DCI);
17730 case ISD::SUB:
17731 return performSubCombine(N, DCI);
17732 case ISD::UADDO_CARRY:
17733 case ISD::USUBO_CARRY:
17734 return performAddCarrySubCarryCombine(N, DCI);
17735 case ISD::FADD:
17736 return performFAddCombine(N, DCI);
17737 case ISD::FSUB:
17738 return performFSubCombine(N, DCI);
17739 case ISD::FDIV:
17740 return performFDivCombine(N, DCI);
17741 case ISD::FMUL:
17742 return performFMulCombine(N, DCI);
17743 case ISD::SETCC:
17744 return performSetCCCombine(N, DCI);
17745 case ISD::SELECT:
17746 if (auto Res = performSelectCombine(N, DCI))
17747 return Res;
17748 break;
17749 case ISD::FMAXNUM:
17750 case ISD::FMINNUM:
17751 case ISD::FMAXNUM_IEEE:
17752 case ISD::FMINNUM_IEEE:
17753 case ISD::FMAXIMUM:
17754 case ISD::FMINIMUM:
17755 case ISD::FMAXIMUMNUM:
17756 case ISD::FMINIMUMNUM:
17757 case ISD::SMAX:
17758 case ISD::SMIN:
17759 case ISD::UMAX:
17760 case ISD::UMIN:
17761 case AMDGPUISD::FMIN_LEGACY:
17762 case AMDGPUISD::FMAX_LEGACY:
17763 return performMinMaxCombine(N, DCI);
17764 case ISD::FMA:
17765 return performFMACombine(N, DCI);
17766 case ISD::AND:
17767 return performAndCombine(N, DCI);
17768 case ISD::OR:
17769 return performOrCombine(N, DCI);
17770 case ISD::FSHR: {
// NOTE(review): a source line is missing here (the declaration of TII used
// just below) — verify against upstream.
17771 // A divergent i32 funnel shift can sometimes be selected as v_perm_b32.
17772 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17773 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17774 return matchPERM(N, DCI);
17775 }
17776 break;
17777 }
17778 case ISD::XOR:
17779 return performXorCombine(N, DCI);
17780 case ISD::ANY_EXTEND:
17781 case ISD::ZERO_EXTEND:
17782 return performZeroOrAnyExtendCombine(N, DCI);
// NOTE(review): a case label is missing here (presumably
// ISD::SIGN_EXTEND_INREG, given the handler) — verify against upstream.
17784 return performSignExtendInRegCombine(N, DCI);
17785 case AMDGPUISD::FP_CLASS:
17786 return performClassCombine(N, DCI);
17787 case ISD::FCANONICALIZE:
17788 return performFCanonicalizeCombine(N, DCI);
17789 case AMDGPUISD::RCP:
17790 return performRcpCombine(N, DCI);
17791 case ISD::FLDEXP:
17792 case AMDGPUISD::FRACT:
17793 case AMDGPUISD::RSQ:
17794 case AMDGPUISD::RCP_LEGACY:
17795 case AMDGPUISD::RCP_IFLAG:
17796 case AMDGPUISD::RSQ_CLAMP: {
17797 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17798 SDValue Src = N->getOperand(0);
17799 if (Src.isUndef())
17800 return Src;
17801 break;
17802 }
17803 case ISD::SINT_TO_FP:
17804 case ISD::UINT_TO_FP:
17805 return performUCharToFloatCombine(N, DCI);
17806 case ISD::FCOPYSIGN:
17807 return performFCopySignCombine(N, DCI);
17808 case AMDGPUISD::CVT_F32_UBYTE0:
17809 case AMDGPUISD::CVT_F32_UBYTE1:
17810 case AMDGPUISD::CVT_F32_UBYTE2:
17811 case AMDGPUISD::CVT_F32_UBYTE3:
17812 return performCvtF32UByteNCombine(N, DCI);
17813 case AMDGPUISD::FMED3:
17814 return performFMed3Combine(N, DCI);
17815 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17816 return performCvtPkRTZCombine(N, DCI);
17817 case AMDGPUISD::CLAMP:
17818 return performClampCombine(N, DCI);
17819 case ISD::SCALAR_TO_VECTOR: {
17820 SelectionDAG &DAG = DCI.DAG;
17821 EVT VT = N->getValueType(0);
17822 
17823 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17824 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17825 SDLoc SL(N);
17826 SDValue Src = N->getOperand(0);
17827 EVT EltVT = Src.getValueType();
17828 if (EltVT != MVT::i16)
17829 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17830 
17831 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17832 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17833 }
17834 
17835 break;
17836 }
// NOTE(review): the case labels on lines 17837 and 17839 (presumably
// ISD::EXTRACT_VECTOR_ELT and ISD::INSERT_VECTOR_ELT) are missing from this
// dump — verify against upstream.
17838 return performExtractVectorEltCombine(N, DCI);
17840 return performInsertVectorEltCombine(N, DCI);
17841 case ISD::FP_ROUND:
17842 return performFPRoundCombine(N, DCI);
17843 case ISD::LOAD: {
17844 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17845 return Widened;
17846 [[fallthrough]];
17847 }
17848 default: {
17849 if (!DCI.isBeforeLegalize()) {
17850 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17851 return performMemSDNodeCombine(MemNode, DCI);
17852 }
17853 
17854 break;
17855 }
17856 }
17857 
// NOTE(review): the final statement (line 17858, presumably delegating to
// the AMDGPUTargetLowering base implementation) is missing from this dump.
17859 }
17860
17861/// Helper function for adjustWritemask
17862static unsigned SubIdx2Lane(unsigned Idx) {
17863 switch (Idx) {
17864 default:
17865 return ~0u;
17866 case AMDGPU::sub0:
17867 return 0;
17868 case AMDGPU::sub1:
17869 return 1;
17870 case AMDGPU::sub2:
17871 return 2;
17872 case AMDGPU::sub3:
17873 return 3;
17874 case AMDGPU::sub4:
17875 return 4; // Possible with TFE/LWE
17876 }
17877}
17878
17879 /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
// Shrinks the dmask to only the components actually extracted by
// EXTRACT_SUBREG users, reselecting an equivalently-masked opcode and
// rewriting the users' subregister indices. Returns Node unchanged when no
// adjustment is possible, or nullptr when all users were already replaced.
17880 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17881 SelectionDAG &DAG) const {
17882 unsigned Opcode = Node->getMachineOpcode();
17883 
17884 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17885 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17886 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17887 return Node; // not implemented for D16
17888 
// One slot per possible lane; the fifth exists only for the TFE/LWE result.
17889 SDNode *Users[5] = {nullptr};
17890 unsigned Lane = 0;
17891 unsigned DmaskIdx =
17892 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17893 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17894 unsigned NewDmask = 0;
17895 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17896 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17897 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17898 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17899 unsigned TFCLane = 0;
17900 bool HasChain = Node->getNumValues() > 1;
17901 
17902 if (OldDmask == 0) {
17903 // These are folded out, but on the chance it happens don't assert.
17904 return Node;
17905 }
17906 
17907 unsigned OldBitsSet = llvm::popcount(OldDmask);
17908 // Work out which is the TFE/LWE lane if that is enabled.
17909 if (UsesTFC) {
17910 TFCLane = OldBitsSet;
17911 }
17912 
17913 // Try to figure out the used register components
17914 for (SDUse &Use : Node->uses()) {
17915 
17916 // Don't look at users of the chain.
17917 if (Use.getResNo() != 0)
17918 continue;
17919 
17920 SDNode *User = Use.getUser();
17921 
17922 // Abort if we can't understand the usage
17923 if (!User->isMachineOpcode() ||
17924 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17925 return Node;
17926 
17927 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17928 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17929 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17930 // set, etc.
17931 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17932 if (Lane == ~0u)
17933 return Node;
17934 
17935 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17936 if (UsesTFC && Lane == TFCLane) {
17937 Users[Lane] = User;
17938 } else {
17939 // Set which texture component corresponds to the lane.
17940 unsigned Comp;
17941 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17942 Comp = llvm::countr_zero(Dmask);
17943 Dmask &= ~(1 << Comp);
17944 }
17945 
17946 // Abort if we have more than one user per component.
17947 if (Users[Lane])
17948 return Node;
17949 
17950 Users[Lane] = User;
17951 NewDmask |= 1 << Comp;
17952 }
17953 }
17954 
17955 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17956 bool NoChannels = !NewDmask;
17957 if (NoChannels) {
17958 if (!UsesTFC) {
17959 // No uses of the result and not using TFC. Then do nothing.
17960 return Node;
17961 }
17962 // If the original dmask has one channel - then nothing to do
17963 if (OldBitsSet == 1)
17964 return Node;
17965 // Use an arbitrary dmask - required for the instruction to work
17966 NewDmask = 1;
17967 }
17968 // Abort if there's no change
17969 if (NewDmask == OldDmask)
17970 return Node;
17971 
17972 unsigned BitsSet = llvm::popcount(NewDmask);
17973 
17974 // Check for TFE or LWE - increase the number of channels by one to account
17975 // for the extra return value
17976 // This will need adjustment for D16 if this is also included in
17977 // adjustWriteMask (this function) but at present D16 are excluded.
17978 unsigned NewChannels = BitsSet + UsesTFC;
17979 
17980 int NewOpcode =
17981 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17982 assert(NewOpcode != -1 &&
17983 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17984 "failed to find equivalent MIMG op");
17985 
17986 // Adjust the writemask in the node
// NOTE(review): a source line is missing here (the declaration of the Ops
// SmallVector appended to below) — verify against upstream.
17988 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17989 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17990 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17991 
17992 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17993 
// 3- and 5-channel results round up to the next legal vector width.
17994 MVT ResultVT = NewChannels == 1
17995 ? SVT
17996 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17997 : NewChannels == 5 ? 8
17998 : NewChannels);
17999 SDVTList NewVTList =
18000 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
18001 
18002 MachineSDNode *NewNode =
18003 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
18004 
18005 if (HasChain) {
18006 // Update chain.
18007 DAG.setNodeMemRefs(NewNode, Node->memoperands());
18008 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
18009 }
18010 
// Scalar result: replace the lone EXTRACT_SUBREG user with a plain COPY.
18011 if (NewChannels == 1) {
18012 assert(Node->hasNUsesOfValue(1, 0));
18013 SDNode *Copy =
18014 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
18015 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
18016 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
18017 return nullptr;
18018 }
18019 
18020 // Update the users of the node with the new indices
18021 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18022 SDNode *User = Users[i];
18023 if (!User) {
18024 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
18025 // Users[0] is still nullptr because channel 0 doesn't really have a use.
18026 if (i || !NoChannels)
18027 continue;
18028 } else {
18029 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
18030 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
18031 if (NewUser != User) {
18032 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
18033 DAG.RemoveDeadNode(User);
18034 }
18035 }
18036 
// Advance to the next subregister index for the next occupied lane.
18037 switch (Idx) {
18038 default:
18039 break;
18040 case AMDGPU::sub0:
18041 Idx = AMDGPU::sub1;
18042 break;
18043 case AMDGPU::sub1:
18044 Idx = AMDGPU::sub2;
18045 break;
18046 case AMDGPU::sub2:
18047 Idx = AMDGPU::sub3;
18048 break;
18049 case AMDGPU::sub3:
18050 Idx = AMDGPU::sub4;
18051 break;
18052 }
18053 }
18054 
18055 DAG.RemoveDeadNode(Node);
18056 return nullptr;
18057 }
18058
// Returns true if Op is a FrameIndexSDNode, looking through one AssertZext
// wrapper first.
// NOTE(review): the function signature line (18059) is missing from this
// dump — verify against upstream.
18060 if (Op.getOpcode() == ISD::AssertZext)
18061 Op = Op.getOperand(0);
18062 
18063 return isa<FrameIndexSDNode>(Op);
18064 }
18065
18066/// Legalize target independent instructions (e.g. INSERT_SUBREG)
18067/// with frame index operands.
18068 /// LLVM assumes that inputs to these instructions are registers.
// Legalizes two things the generic selector leaves behind: i1 CopyToReg into
// a physical register (routed through a VReg_1 virtual register), and frame
// index operands (materialized into registers via S_MOV_B32).
18069 SDNode *
// NOTE(review): the continuation line naming this function
// (SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,) is missing
// from this dump — verify against upstream.
18071 SelectionDAG &DAG) const {
18072 if (Node->getOpcode() == ISD::CopyToReg) {
18073 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
18074 SDValue SrcVal = Node->getOperand(2);
18075 
18076 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
18077 // to try understanding copies to physical registers.
18078 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
18079 SDLoc SL(Node);
// NOTE(review): a source line is missing here (the declaration of MRI used
// just below) — verify against upstream.
18081 SDValue VReg = DAG.getRegister(
18082 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
18083 
// Preserve the original glue (if any) through the new two-step copy.
18084 SDNode *Glued = Node->getGluedNode();
18085 SDValue ToVReg = DAG.getCopyToReg(
18086 Node->getOperand(0), SL, VReg, SrcVal,
18087 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
18088 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
18089 VReg, ToVReg.getValue(1));
18090 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
18091 DAG.RemoveDeadNode(Node);
18092 return ToResultReg.getNode();
18093 }
18094 }
18095 
// NOTE(review): a source line is missing here (the declaration of the Ops
// SmallVector populated below) — verify against upstream.
18097 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18098 if (!isFrameIndexOp(Node->getOperand(i))) {
18099 Ops.push_back(Node->getOperand(i));
18100 continue;
18101 }
18102 
// Frame indexes can't be used directly as operands; wrap each in S_MOV_B32.
18103 SDLoc DL(Node);
18104 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
18105 Node->getOperand(i).getValueType(),
18106 Node->getOperand(i)),
18107 0));
18108 }
18109 
18110 return DAG.UpdateNodeOperands(Node, Ops);
18111 }
18112
18113/// Fold the instructions after selecting them.
18114/// Returns null if users were already updated.
18116 SelectionDAG &DAG) const {
18118 unsigned Opcode = Node->getMachineOpcode();
18119
18120 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18121 !TII->isGather4(Opcode) &&
18122 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
18123 return adjustWritemask(Node, DAG);
18124 }
18125
18126 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18128 return Node;
18129 }
18130
18131 switch (Opcode) {
18132 case AMDGPU::V_DIV_SCALE_F32_e64:
18133 case AMDGPU::V_DIV_SCALE_F64_e64: {
18134 // Satisfy the operand register constraint when one of the inputs is
18135 // undefined. Ordinarily each undef value will have its own implicit_def of
18136 // a vreg, so force these to use a single register.
18137 SDValue Src0 = Node->getOperand(1);
18138 SDValue Src1 = Node->getOperand(3);
18139 SDValue Src2 = Node->getOperand(5);
18140
18141 if ((Src0.isMachineOpcode() &&
18142 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
18143 (Src0 == Src1 || Src0 == Src2))
18144 break;
18145
18146 MVT VT = Src0.getValueType().getSimpleVT();
18147 const TargetRegisterClass *RC =
18148 getRegClassFor(VT, Src0.getNode()->isDivergent());
18149
18151 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
18152
18153 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
18154 Src0, SDValue());
18155
18156 // src0 must be the same register as src1 or src2, even if the value is
18157 // undefined, so make sure we don't violate this constraint.
18158 if (Src0.isMachineOpcode() &&
18159 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
18160 if (Src1.isMachineOpcode() &&
18161 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18162 Src0 = Src1;
18163 else if (Src2.isMachineOpcode() &&
18164 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18165 Src0 = Src2;
18166 else {
18167 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
18168 Src0 = UndefReg;
18169 Src1 = UndefReg;
18170 }
18171 } else
18172 break;
18173
18175 Ops[1] = Src0;
18176 Ops[3] = Src1;
18177 Ops[5] = Src2;
18178 Ops.push_back(ImpDef.getValue(1));
18179 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
18180 }
18181 default:
18182 break;
18183 }
18184
18185 return Node;
18186}
18187
18188// Any MIMG instructions that use tfe or lwe require an initialization of the
18189// result register that will be written in the case of a memory access failure.
18190// The required code is also added to tie this init code to the result of the
18191// img instruction.
18194 const SIRegisterInfo &TRI = TII->getRegisterInfo();
18195 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
18196 MachineBasicBlock &MBB = *MI.getParent();
18197
18198 int DstIdx =
18199 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18200 unsigned InitIdx = 0;
18201
18202 if (TII->isImage(MI)) {
18203 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
18204 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
18205 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
18206
18207 if (!TFE && !LWE) // intersect_ray
18208 return;
18209
18210 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18211 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18212 unsigned D16Val = D16 ? D16->getImm() : 0;
18213
18214 if (!TFEVal && !LWEVal)
18215 return;
18216
18217 // At least one of TFE or LWE are non-zero
18218 // We have to insert a suitable initialization of the result value and
18219 // tie this to the dest of the image instruction.
18220
18221 // Calculate which dword we have to initialize to 0.
18222 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
18223
18224 // check that dmask operand is found.
18225 assert(MO_Dmask && "Expected dmask operand in instruction");
18226
18227 unsigned dmask = MO_Dmask->getImm();
18228 // Determine the number of active lanes taking into account the
18229 // Gather4 special case
18230 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
18231
18232 bool Packed = !Subtarget->hasUnpackedD16VMem();
18233
18234 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18235
18236 // Abandon attempt if the dst size isn't large enough
18237 // - this is in fact an error but this is picked up elsewhere and
18238 // reported correctly.
18239 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18240
18241 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18242 if (DstSize < InitIdx)
18243 return;
18244 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
18245 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18246 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18247 } else {
18248 return;
18249 }
18250
18251 const DebugLoc &DL = MI.getDebugLoc();
18252
18253 // Create a register for the initialization value.
18254 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
18255 unsigned NewDst = 0; // Final initialized value will be in here
18256
18257 // If PRTStrictNull feature is enabled (the default) then initialize
18258 // all the result registers to 0, otherwise just the error indication
18259 // register (VGPRn+1)
18260 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18261 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18262
18263 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
18264 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18265 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
18266 // Initialize dword
18267 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
18268 // clang-format off
18269 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
18270 .addImm(0);
18271 // clang-format on
18272 // Insert into the super-reg
18273 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
18274 .addReg(PrevDst)
18275 .addReg(SubReg)
18277
18278 PrevDst = NewDst;
18279 }
18280
18281 // Add as an implicit operand
18282 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
18283
18284 // Tie the just added implicit operand to the dst
18285 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18286}
18287
18288/// Assign the register class depending on the number of
18289/// bits set in the writemask
18291 SDNode *Node) const {
18293
18294 MachineFunction *MF = MI.getMF();
18295 MachineRegisterInfo &MRI = MF->getRegInfo();
18296
18297 if (TII->isVOP3(MI.getOpcode())) {
18298 // Make sure constant bus requirements are respected.
18299 TII->legalizeOperandsVOP3(MRI, MI);
18300
18301 if (TII->isMAI(MI)) {
18302 // The ordinary src0, src1, src2 were legalized above.
18303 //
18304 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18305 // as a separate instruction.
18306 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18307 AMDGPU::OpName::scale_src0);
18308 if (Src0Idx != -1) {
18309 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18310 AMDGPU::OpName::scale_src1);
18311 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18312 TII->usesConstantBus(MRI, MI, Src1Idx))
18313 TII->legalizeOpWithMove(MI, Src1Idx);
18314 }
18315 }
18316
18317 return;
18318 }
18319
18320 if (TII->isImage(MI))
18321 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
18322}
18323
// Materialize the 32-bit immediate \p Val into an SGPR with S_MOV_B32 and
// return the defining node's result as an SDValue.
// NOTE(review): the first signature line (original line 18324, presumably
// "static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,") was
// dropped by the rendered listing; only the trailing parameter is visible.
18325 uint64_t Val) {
18326 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
18327 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
18328}
18329
18331 const SDLoc &DL,
18332 SDValue Ptr) const {
18334
18335 // Build the half of the subregister with the constants before building the
18336 // full 128-bit register. If we are building multiple resource descriptors,
18337 // this will allow CSEing of the 2-component register.
18338 const SDValue Ops0[] = {
18339 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
18340 buildSMovImm32(DAG, DL, 0),
18341 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
18342 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
18343 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
18344
18345 SDValue SubRegHi = SDValue(
18346 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
18347
18348 // Combine the constants and the pointer.
18349 const SDValue Ops1[] = {
18350 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
18351 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
18352 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
18353
18354 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
18355}
18356
18357/// Return a resource descriptor with the 'Add TID' bit enabled
18358/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
18359/// of the resource descriptor) to create an offset, which is added to
18360/// the resource pointer.
18362 SDValue Ptr, uint32_t RsrcDword1,
18363 uint64_t RsrcDword2And3) const {
18364 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
18365 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
18366 if (RsrcDword1) {
18367 PtrHi =
18368 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
18369 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
18370 0);
18371 }
18372
18373 SDValue DataLo =
18374 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
18375 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
18376
18377 const SDValue Ops[] = {
18378 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
18379 PtrLo,
18380 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
18381 PtrHi,
18382 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
18383 DataLo,
18384 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
18385 DataHi,
18386 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
18387
18388 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
18389}
18390
18391//===----------------------------------------------------------------------===//
18392// SI Inline Assembly Support
18393//===----------------------------------------------------------------------===//
18394
18395std::pair<unsigned, const TargetRegisterClass *>
18397 StringRef Constraint,
18398 MVT VT) const {
18399 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
18400
18401 const TargetRegisterClass *RC = nullptr;
18402 if (Constraint.size() == 1) {
18403 // Check if we cannot determine the bit size of the given value type. This
18404 // can happen, for example, in this situation where we have an empty struct
18405 // (size 0): `call void asm "", "v"({} poison)`-
18406 if (VT == MVT::Other)
18407 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18408 const unsigned BitWidth = VT.getSizeInBits();
18409 switch (Constraint[0]) {
18410 default:
18411 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18412 case 's':
18413 case 'r':
18414 switch (BitWidth) {
18415 case 16:
18416 RC = &AMDGPU::SReg_32RegClass;
18417 break;
18418 case 64:
18419 RC = &AMDGPU::SGPR_64RegClass;
18420 break;
18421 default:
18423 if (!RC)
18424 return std::pair(0U, nullptr);
18425 break;
18426 }
18427 break;
18428 case 'v':
18429 switch (BitWidth) {
18430 case 1:
18431 return std::pair(0U, nullptr);
18432 case 16:
18433 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18434 : &AMDGPU::VGPR_32_Lo256RegClass;
18435 break;
18436 default:
18437 RC = Subtarget->has1024AddressableVGPRs()
18438 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18439 : TRI->getVGPRClassForBitWidth(BitWidth);
18440 if (!RC)
18441 return std::pair(0U, nullptr);
18442 break;
18443 }
18444 break;
18445 case 'a':
18446 if (!Subtarget->hasMAIInsts())
18447 break;
18448 switch (BitWidth) {
18449 case 1:
18450 return std::pair(0U, nullptr);
18451 case 16:
18452 RC = &AMDGPU::AGPR_32RegClass;
18453 break;
18454 default:
18455 RC = TRI->getAGPRClassForBitWidth(BitWidth);
18456 if (!RC)
18457 return std::pair(0U, nullptr);
18458 break;
18459 }
18460 break;
18461 }
18462 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18463 const unsigned BitWidth = VT.getSizeInBits();
18464 switch (BitWidth) {
18465 case 16:
18466 RC = &AMDGPU::AV_32RegClass;
18467 break;
18468 default:
18469 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
18470 if (!RC)
18471 return std::pair(0U, nullptr);
18472 break;
18473 }
18474 }
18475
18476 // We actually support i128, i16 and f16 as inline parameters
18477 // even if they are not reported as legal
18478 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
18479 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
18480 return std::pair(0U, RC);
18481
18482 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
18483 if (Kind != '\0') {
18484 if (Kind == 'v') {
18485 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18486 } else if (Kind == 's') {
18487 RC = &AMDGPU::SGPR_32RegClass;
18488 } else if (Kind == 'a') {
18489 RC = &AMDGPU::AGPR_32RegClass;
18490 }
18491
18492 if (RC) {
18493 if (NumRegs > 1) {
18494 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
18495 return std::pair(0U, nullptr);
18496
18497 uint32_t Width = NumRegs * 32;
18498 // Prohibit constraints for register ranges with a width that does not
18499 // match the required type.
18500 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
18501 return std::pair(0U, nullptr);
18502
18503 MCRegister Reg = RC->getRegister(Idx);
18505 RC = TRI->getVGPRClassForBitWidth(Width);
18506 else if (SIRegisterInfo::isSGPRClass(RC))
18507 RC = TRI->getSGPRClassForBitWidth(Width);
18508 else if (SIRegisterInfo::isAGPRClass(RC))
18509 RC = TRI->getAGPRClassForBitWidth(Width);
18510 if (RC) {
18511 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18512 if (!Reg) {
18513 // The register class does not contain the requested register,
18514 // e.g., because it is an SGPR pair that would violate alignment
18515 // requirements.
18516 return std::pair(0U, nullptr);
18517 }
18518 return std::pair(Reg, RC);
18519 }
18520 }
18521
18522 // Check for lossy scalar/vector conversions.
18523 if (VT.isVector() && VT.getSizeInBits() != 32)
18524 return std::pair(0U, nullptr);
18525 if (Idx < RC->getNumRegs())
18526 return std::pair(RC->getRegister(Idx), RC);
18527 return std::pair(0U, nullptr);
18528 }
18529 }
18530
18531 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18532 if (Ret.first)
18533 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
18534
18535 return Ret;
18536}
18537
18538static bool isImmConstraint(StringRef Constraint) {
18539 if (Constraint.size() == 1) {
18540 switch (Constraint[0]) {
18541 default:
18542 break;
18543 case 'I':
18544 case 'J':
18545 case 'A':
18546 case 'B':
18547 case 'C':
18548 return true;
18549 }
18550 } else if (Constraint == "DA" || Constraint == "DB") {
18551 return true;
18552 }
18553 return false;
18554}
18555
18558 if (Constraint.size() == 1) {
18559 switch (Constraint[0]) {
18560 default:
18561 break;
18562 case 's':
18563 case 'v':
18564 case 'a':
18565 return C_RegisterClass;
18566 }
18567 } else if (Constraint.size() == 2) {
18568 if (Constraint == "VA")
18569 return C_RegisterClass;
18570 }
18571 if (isImmConstraint(Constraint)) {
18572 return C_Other;
18573 }
18574 return TargetLowering::getConstraintType(Constraint);
18575}
18576
// Mask \p Val down to its low \p Size bits so that bits above the operand's
// width do not leak into the emitted target constant.
// NOTE(review): the guard line (original line 18578, presumably a
// "if (Size < 64) {" or similar size check) was dropped by the rendered
// listing — the bare "}" on line 18580 closes that missing conditional.
18577 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18579 Val = Val & maskTrailingOnes<uint64_t>(Size);
18580 }
18581 return Val;
18582}
18583
18585 StringRef Constraint,
18586 std::vector<SDValue> &Ops,
18587 SelectionDAG &DAG) const {
18588 if (isImmConstraint(Constraint)) {
18589 uint64_t Val;
18590 if (getAsmOperandConstVal(Op, Val) &&
18591 checkAsmConstraintVal(Op, Constraint, Val)) {
18592 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
18593 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
18594 }
18595 } else {
18597 }
18598}
18599
18601 unsigned Size = Op.getScalarValueSizeInBits();
18602 if (Size > 64)
18603 return false;
18604
18605 if (Size == 16 && !Subtarget->has16BitInsts())
18606 return false;
18607
18609 Val = C->getSExtValue();
18610 return true;
18611 }
18613 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18614 return true;
18615 }
18617 if (Size != 16 || Op.getNumOperands() != 2)
18618 return false;
18619 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
18620 return false;
18621 if (ConstantSDNode *C = V->getConstantSplatNode()) {
18622 Val = C->getSExtValue();
18623 return true;
18624 }
18625 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18626 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18627 return true;
18628 }
18629 }
18630
18631 return false;
18632}
18633
18635 uint64_t Val) const {
18636 if (Constraint.size() == 1) {
18637 switch (Constraint[0]) {
18638 case 'I':
18640 case 'J':
18641 return isInt<16>(Val);
18642 case 'A':
18643 return checkAsmConstraintValA(Op, Val);
18644 case 'B':
18645 return isInt<32>(Val);
18646 case 'C':
18647 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
18649 default:
18650 break;
18651 }
18652 } else if (Constraint.size() == 2) {
18653 if (Constraint == "DA") {
18654 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18655 int64_t LoBits = static_cast<int32_t>(Val);
18656 return checkAsmConstraintValA(Op, HiBits, 32) &&
18657 checkAsmConstraintValA(Op, LoBits, 32);
18658 }
18659 if (Constraint == "DB") {
18660 return true;
18661 }
18662 }
18663 llvm_unreachable("Invalid asm constraint");
18664}
18665
18667 unsigned MaxSize) const {
18668 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18669 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18670 if (Size == 16) {
18671 MVT VT = Op.getSimpleValueType();
18672 switch (VT.SimpleTy) {
18673 default:
18674 return false;
18675 case MVT::i16:
18676 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
18677 case MVT::f16:
18678 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
18679 case MVT::bf16:
18680 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
18681 case MVT::v2i16:
18682 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
18683 case MVT::v2f16:
18684 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
18685 case MVT::v2bf16:
18686 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
18687 }
18688 }
18689 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
18690 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
18691 return true;
18692 return false;
18693}
18694
18695static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18696 switch (UnalignedClassID) {
18697 case AMDGPU::VReg_64RegClassID:
18698 return AMDGPU::VReg_64_Align2RegClassID;
18699 case AMDGPU::VReg_96RegClassID:
18700 return AMDGPU::VReg_96_Align2RegClassID;
18701 case AMDGPU::VReg_128RegClassID:
18702 return AMDGPU::VReg_128_Align2RegClassID;
18703 case AMDGPU::VReg_160RegClassID:
18704 return AMDGPU::VReg_160_Align2RegClassID;
18705 case AMDGPU::VReg_192RegClassID:
18706 return AMDGPU::VReg_192_Align2RegClassID;
18707 case AMDGPU::VReg_224RegClassID:
18708 return AMDGPU::VReg_224_Align2RegClassID;
18709 case AMDGPU::VReg_256RegClassID:
18710 return AMDGPU::VReg_256_Align2RegClassID;
18711 case AMDGPU::VReg_288RegClassID:
18712 return AMDGPU::VReg_288_Align2RegClassID;
18713 case AMDGPU::VReg_320RegClassID:
18714 return AMDGPU::VReg_320_Align2RegClassID;
18715 case AMDGPU::VReg_352RegClassID:
18716 return AMDGPU::VReg_352_Align2RegClassID;
18717 case AMDGPU::VReg_384RegClassID:
18718 return AMDGPU::VReg_384_Align2RegClassID;
18719 case AMDGPU::VReg_512RegClassID:
18720 return AMDGPU::VReg_512_Align2RegClassID;
18721 case AMDGPU::VReg_1024RegClassID:
18722 return AMDGPU::VReg_1024_Align2RegClassID;
18723 case AMDGPU::AReg_64RegClassID:
18724 return AMDGPU::AReg_64_Align2RegClassID;
18725 case AMDGPU::AReg_96RegClassID:
18726 return AMDGPU::AReg_96_Align2RegClassID;
18727 case AMDGPU::AReg_128RegClassID:
18728 return AMDGPU::AReg_128_Align2RegClassID;
18729 case AMDGPU::AReg_160RegClassID:
18730 return AMDGPU::AReg_160_Align2RegClassID;
18731 case AMDGPU::AReg_192RegClassID:
18732 return AMDGPU::AReg_192_Align2RegClassID;
18733 case AMDGPU::AReg_256RegClassID:
18734 return AMDGPU::AReg_256_Align2RegClassID;
18735 case AMDGPU::AReg_512RegClassID:
18736 return AMDGPU::AReg_512_Align2RegClassID;
18737 case AMDGPU::AReg_1024RegClassID:
18738 return AMDGPU::AReg_1024_Align2RegClassID;
18739 default:
18740 return -1;
18741 }
18742}
18743
18744// Figure out which registers should be reserved for stack access. Only after
18745// the function is legalized do we know all of the non-spill stack objects or if
18746// calls are present.
18748 MachineRegisterInfo &MRI = MF.getRegInfo();
18750 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18751 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18752 const SIInstrInfo *TII = ST.getInstrInfo();
18753
18754 if (Info->isEntryFunction()) {
18755 // Callable functions have fixed registers used for stack access.
18757 }
18758
18759 // TODO: Move this logic to getReservedRegs()
18760 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18761 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18762 Register SReg = ST.isWave32()
18763 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18764 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18765 &AMDGPU::SGPR_64RegClass);
18766 Info->setSGPRForEXECCopy(SReg);
18767
18768 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18769 Info->getStackPtrOffsetReg()));
18770 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18771 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18772
18773 // We need to worry about replacing the default register with itself in case
18774 // of MIR testcases missing the MFI.
18775 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18776 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18777
18778 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18779 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18780
18781 Info->limitOccupancy(MF);
18782
18783 if (ST.isWave32() && !MF.empty()) {
18784 for (auto &MBB : MF) {
18785 for (auto &MI : MBB) {
18786 TII->fixImplicitOperands(MI);
18787 }
18788 }
18789 }
18790
18791 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18792 // classes if required. Ideally the register class constraints would differ
18793 // per-subtarget, but there's no easy way to achieve that right now. This is
18794 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18795 // from using them as the register class for legal types.
18796 if (ST.needsAlignedVGPRs()) {
18797 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18798 const Register Reg = Register::index2VirtReg(I);
18799 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18800 if (!RC)
18801 continue;
18802 int NewClassID = getAlignedAGPRClassID(RC->getID());
18803 if (NewClassID != -1)
18804 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18805 }
18806 }
18807
18809}
18810
18812 KnownBits &Known,
18813 const APInt &DemandedElts,
18814 const SelectionDAG &DAG,
18815 unsigned Depth) const {
18816 Known.resetAll();
18817 unsigned Opc = Op.getOpcode();
18818 switch (Opc) {
18820 unsigned IID = Op.getConstantOperandVal(0);
18821 switch (IID) {
18822 case Intrinsic::amdgcn_mbcnt_lo:
18823 case Intrinsic::amdgcn_mbcnt_hi: {
18824 const GCNSubtarget &ST =
18826 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18827 // most 31 + src1.
18828 Known.Zero.setBitsFrom(
18829 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18830 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18831 Known = KnownBits::add(Known, Known2);
18832 return;
18833 }
18834 }
18835 break;
18836 }
18837 }
18839 Op, Known, DemandedElts, DAG, Depth);
18840}
18841
// Report known-zero high bits for a frame-index value: the scratch offset
// cannot exceed the per-wave scratch limit, so the bits above that limit
// (including the sign bit, see the comment below) are known zero.
// NOTE(review): the signature start (original line 18842, presumably
// "void SITargetLowering::computeKnownBitsForFrameIndex(") and line 18844
// (presumably a Known.resetAll()/reset call) were dropped by the rendered
// listing.
18843 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18845
18846 // Set the high bits to zero based on the maximum allowed scratch size per
18847 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18848 // calculation won't overflow, so assume the sign bit is never set.
18849 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18850}
18851
// Set known-zero high bits for a workitem-ID value in dimension \p Dim:
// the ID is bounded by the subtarget's maximum workitem ID for the
// function, so all bits above that maximum are known zero.
// NOTE(review): the first signature line (original line 18852, presumably
// "static void knownBitsForWorkitemID(const GCNSubtarget &ST,") was
// dropped by the rendered listing.
18853 GISelValueTracking &VT, KnownBits &Known,
18854 unsigned Dim) {
18855 unsigned MaxValue =
18856 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18857 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18858}
18859
18861 KnownBits &Known, const APInt &DemandedElts,
18862 unsigned BFEWidth, bool SExt, unsigned Depth) {
18864 const MachineOperand &Src1 = MI.getOperand(2);
18865
18866 unsigned Src1Cst = 0;
18867 if (Src1.isImm()) {
18868 Src1Cst = Src1.getImm();
18869 } else if (Src1.isReg()) {
18870 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18871 if (!Cst)
18872 return;
18873 Src1Cst = Cst->Value.getZExtValue();
18874 } else {
18875 return;
18876 }
18877
18878 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18879 // Width is always [22:16].
18880 const unsigned Offset =
18881 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18882 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18883
18884 if (Width >= BFEWidth) // Ill-formed.
18885 return;
18886
18887 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18888 Depth + 1);
18889
18890 Known = Known.extractBits(Width, Offset);
18891
18892 if (SExt)
18893 Known = Known.sext(BFEWidth);
18894 else
18895 Known = Known.zext(BFEWidth);
18896}
18897
18899 GISelValueTracking &VT, Register R, KnownBits &Known,
18900 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18901 unsigned Depth) const {
18902 Known.resetAll();
18903 const MachineInstr *MI = MRI.getVRegDef(R);
18904 switch (MI->getOpcode()) {
18905 case AMDGPU::S_BFE_I32:
18906 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18907 /*SExt=*/true, Depth);
18908 case AMDGPU::S_BFE_U32:
18909 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18910 /*SExt=*/false, Depth);
18911 case AMDGPU::S_BFE_I64:
18912 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18913 /*SExt=*/true, Depth);
18914 case AMDGPU::S_BFE_U64:
18915 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18916 /*SExt=*/false, Depth);
18917 case AMDGPU::G_INTRINSIC:
18918 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18919 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18920 switch (IID) {
18921 case Intrinsic::amdgcn_workitem_id_x:
18922 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18923 break;
18924 case Intrinsic::amdgcn_workitem_id_y:
18925 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18926 break;
18927 case Intrinsic::amdgcn_workitem_id_z:
18928 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18929 break;
18930 case Intrinsic::amdgcn_mbcnt_lo:
18931 case Intrinsic::amdgcn_mbcnt_hi: {
18932 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18933 // most 31 + src1.
18934 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18935 ? getSubtarget()->getWavefrontSizeLog2()
18936 : 5);
18937 KnownBits Known2;
18938 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18939 Depth + 1);
18940 Known = KnownBits::add(Known, Known2);
18941 break;
18942 }
18943 case Intrinsic::amdgcn_groupstaticsize: {
18944 // We can report everything over the maximum size as 0. We can't report
18945 // based on the actual size because we don't know if it's accurate or not
18946 // at any given point.
18947 Known.Zero.setHighBits(
18948 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18949 break;
18950 }
18951 }
18952 break;
18953 }
18954 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18955 Known.Zero.setHighBits(24);
18956 break;
18957 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18958 Known.Zero.setHighBits(16);
18959 break;
18960 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
18961 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
18962 // producing exactly 0 or 1.
18963 Known.Zero.setHighBits(Known.getBitWidth() - 1);
18964 break;
18965 case AMDGPU::G_AMDGPU_SMED3:
18966 case AMDGPU::G_AMDGPU_UMED3: {
18967 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18968
18969 KnownBits Known2;
18970 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18971 if (Known2.isUnknown())
18972 break;
18973
18974 KnownBits Known1;
18975 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18976 if (Known1.isUnknown())
18977 break;
18978
18979 KnownBits Known0;
18980 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18981 if (Known0.isUnknown())
18982 break;
18983
18984 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18985 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18986 Known.One = Known0.One & Known1.One & Known2.One;
18987 break;
18988 }
18989 }
18990}
18991
18994 unsigned Depth) const {
18995 const MachineInstr *MI = MRI.getVRegDef(R);
18996 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18997 // FIXME: Can this move to generic code? What about the case where the call
18998 // site specifies a lower alignment?
18999 Intrinsic::ID IID = GI->getIntrinsicID();
19001 AttributeList Attrs =
19002 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
19003 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19004 return *RetAlign;
19005 }
19006 return Align(1);
19007}
19008
// NOTE(review): this listing is a doxygen extraction; the function signature
// (presumably Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML))
// and a line declaring TII were dropped. Jumps in the embedded line numbers
// mark the gaps -- confirm against the upstream source.
// Computes the preferred alignment for a machine loop header, and on GFX10+
// may also insert S_INST_PREFETCH instructions around the loop.
19011 const Align CacheLineAlign = Align(64);
19012
19013 // GFX950: Prevent an 8-byte instruction at loop header from being split by
19014 // the 32-byte instruction fetch window boundary. This avoids a significant
19015 // fetch delay after backward branch. We use 32-byte alignment with max
19016 // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
19017 if (ML && !DisableLoopAlignment &&
19018 getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
19019 const MachineBasicBlock *Header = ML->getHeader();
19020 // Respect user-specified or previously set alignment.
19021 if (Header->getAlignment() != PrefAlign)
19022 return Header->getAlignment();
19023 if (needsFetchWindowAlignment(*Header))
19024 return Align(32);
19025 }
19026
19027 // Pre-GFX10 target did not benefit from loop alignment
19028 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
19029 getSubtarget()->hasInstFwdPrefetchBug())
19030 return PrefAlign;
19031
19032 // On GFX10 I$ is 4 x 64 bytes cache lines.
19033 // By default prefetcher keeps one cache line behind and reads two ahead.
19034 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
19035 // behind and one ahead.
19036 // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
19037 // If loop fits 64 bytes it always spans no more than two cache lines and
19038 // does not need an alignment.
19039 // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
19040 // Else if loop is less or equal 192 bytes we need two lines behind.
19041
// NOTE(review): a line is missing here (presumably the declaration of the
// TII instruction-info pointer used below).
19043 const MachineBasicBlock *Header = ML->getHeader();
19044 if (Header->getAlignment() != PrefAlign)
19045 return Header->getAlignment(); // Already processed.
19046
// Sum the byte size of the loop body; blocks other than the header account
// for expected alignment padding as half the alignment on average.
19047 unsigned LoopSize = 0;
19048 for (const MachineBasicBlock *MBB : ML->blocks()) {
19049 // If inner loop block is aligned assume in average half of the alignment
19050 // size to be added as nops.
19051 if (MBB != Header)
19052 LoopSize += MBB->getAlignment().value() / 2;
19053
19054 for (const MachineInstr &MI : *MBB) {
19055 LoopSize += TII->getInstSizeInBytes(MI);
// Early out: loops over 192 bytes get no benefit from prefetch tuning.
19056 if (LoopSize > 192)
19057 return PrefAlign;
19058 }
19059 }
19060
19061 if (LoopSize <= 64)
19062 return PrefAlign;
19063
19064 if (LoopSize <= 128)
19065 return CacheLineAlign;
19066
19067 // If any of parent loops is surrounded by prefetch instructions do not
19068 // insert new for inner loop, which would reset parent's settings.
19069 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
19070 if (MachineBasicBlock *Exit = P->getExitBlock()) {
19071 auto I = Exit->getFirstNonDebugInstr();
19072 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19073 return CacheLineAlign;
19074 }
19075 }
19076
// Bracket the loop with S_INST_PREFETCH: switch to "2 lines behind" in the
// preheader and restore "1 line behind" in the exit block, avoiding
// duplicate prefetch instructions if they are already present.
19077 MachineBasicBlock *Pre = ML->getLoopPreheader();
19078 MachineBasicBlock *Exit = ML->getExitBlock();
19079
19080 if (Pre && Exit) {
19081 auto PreTerm = Pre->getFirstTerminator();
19082 if (PreTerm == Pre->begin() ||
19083 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19084 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19085 .addImm(1); // prefetch 2 lines behind PC
19086
19087 auto ExitHead = Exit->getFirstNonDebugInstr();
19088 if (ExitHead == Exit->end() ||
19089 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19090 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19091 .addImm(2); // prefetch 1 line behind PC
19092 }
19093
19094 return CacheLineAlign;
19095}
19096
// NOTE(review): the first signature line (return type and function name,
// presumably SITargetLowering::getMaxPermittedBytesForAlignment) and the
// final fallthrough return were dropped by the extraction -- confirm against
// the upstream source.
 19098 MachineBasicBlock *MBB) const {
 19099 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
 19100 // instruction could be split by the 32-byte fetch window boundary.
 19101 // See getPrefLoopAlignment() for context.
 19102 if (needsFetchWindowAlignment(*MBB))
 19103 return 4;
 19105}
19106
// Returns true when the first real (non-meta) instruction of \p MBB is wider
// than 4 bytes and could therefore straddle a 32-byte instruction fetch
// window on subtargets sensitive to such splits.
19107bool SITargetLowering::needsFetchWindowAlignment(
19108 const MachineBasicBlock &MBB) const {
19109 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19110 return false;
// NOTE(review): a line is missing here in the extraction (presumably the
// declaration of the TII pointer used below) -- confirm upstream.
19112 for (const MachineInstr &MI : MBB) {
19113 if (MI.isMetaInstruction())
19114 continue;
19115 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
19116 return TII->getInstSizeInBytes(MI) > 4;
19117 }
// The block contains only meta instructions; nothing can be split.
19118 return false;
19119}
19120
19121[[maybe_unused]]
19122static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
19123 assert(N->getOpcode() == ISD::CopyFromReg);
19124 do {
19125 // Follow the chain until we find an INLINEASM node.
19126 N = N->getOperand(0).getNode();
19127 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
19128 return true;
19129 } while (N->getOpcode() == ISD::CopyFromReg);
19130 return false;
19131}
19132
// NOTE(review): the start of this signature (presumably
// bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
// FunctionLoweringInfo *FLI, ...)) was dropped by the extraction, along with
// a few inner lines (marked below). Confirm against the upstream source.
// Decides whether an SDNode produces a divergent (per-lane varying) value.
 19135 UniformityInfo *UA) const {
 19136 switch (N->getOpcode()) {
 19137 case ISD::CopyFromReg: {
 19138 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
 19139 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
 19140 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
 19141 Register Reg = R->getReg();
 19142
 19143 // FIXME: Why does this need to consider isLiveIn?
 19144 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
 19145 return !TRI->isSGPRReg(MRI, Reg);
 19146
// Prefer the IR-level uniformity answer when the vreg maps to an IR value.
 19147 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
 19148 return UA->isDivergent(V);
 19149
// NOTE(review): a line is missing here; fall back to the register-class check.
 19151 return !TRI->isSGPRReg(MRI, Reg);
 19152 }
 19153 case ISD::LOAD: {
 19154 const LoadSDNode *L = cast<LoadSDNode>(N);
 19155 unsigned AS = L->getAddressSpace();
 19156 // A flat load may access private memory.
// NOTE(review): the return expression for the LOAD case (presumably an
// address-space test) was dropped by the extraction.
 19158 }
 19159 case ISD::CALLSEQ_END:
 19160 return true;
// NOTE(review): the INTRINSIC_WO_CHAIN / INTRINSIC_W_CHAIN case labels were
// dropped; the two returns below dispatch on the intrinsic-id operand
// (operand 0 and operand 1 respectively).
 19162 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
 19164 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
 19165 case AMDGPUISD::ATOMIC_CMP_SWAP:
 19166 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
 19167 case AMDGPUISD::BUFFER_ATOMIC_ADD:
 19168 case AMDGPUISD::BUFFER_ATOMIC_SUB:
 19169 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
 19170 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
 19171 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
 19172 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
 19173 case AMDGPUISD::BUFFER_ATOMIC_AND:
 19174 case AMDGPUISD::BUFFER_ATOMIC_OR:
 19175 case AMDGPUISD::BUFFER_ATOMIC_XOR:
 19176 case AMDGPUISD::BUFFER_ATOMIC_INC:
 19177 case AMDGPUISD::BUFFER_ATOMIC_DEC:
 19178 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
 19179 case AMDGPUISD::BUFFER_ATOMIC_FADD:
 19180 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
 19181 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
 19182 // Target-specific read-modify-write atomics are sources of divergence.
 19183 return true;
 19184 default:
 19185 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
 19186 // Generic read-modify-write atomics are sources of divergence.
 19187 return A->readMem() && A->writeMem();
 19188 }
 19189 return false;
 19190 }
 19191}
19192
// NOTE(review): the signature start was dropped by the extraction; this
// appears to report whether denormals are enabled for the given FP value
// type. The return expressions for f32 and for f64/f16 were also dropped
// (presumably denormal-mode queries) -- confirm against the upstream source.
 19194 EVT VT) const {
 19195 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
 19196 case MVT::f32:
 19198 case MVT::f64:
 19199 case MVT::f16:
 19201 default:
 19202 return false;
 19203 }
 19204}
19205
// NOTE(review): the signature start (presumably
// bool SITargetLowering::denormalsEnabledForType) was dropped by the
// extraction. Reports per-scalar-size whether denormals are enabled in \p MF.
 19207 LLT Ty, const MachineFunction &MF) const {
 19208 switch (Ty.getScalarSizeInBits()) {
 19209 case 32:
// f32 denormal flushing is controlled separately from f64/f16.
 19210 return !denormalModeIsFlushAllF32(MF);
 19211 case 64:
 19212 case 16:
 19213 return !denormalModeIsFlushAllF64F16(MF);
 19214 default:
 19215 return false;
 19216 }
 19217}
19218
// NOTE(review): the first signature line (presumably
// bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, ...)) was
// dropped by the extraction, as were the declaration of Info and the start
// of the delegating call at the end -- confirm against the upstream source.
 19220 const APInt &DemandedElts,
 19221 const SelectionDAG &DAG,
 19222 bool SNaN,
 19223 unsigned Depth) const {
 19224 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
 19225 const MachineFunction &MF = DAG.getMachineFunction();
// NOTE(review): a line declaring Info (machine function info) is missing.
 19227
 19228 if (Info->getMode().DX10Clamp)
 19229 return true; // Clamped to 0.
 19230 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
 19231 }
 19232
// NOTE(review): the start of this delegating call (presumably to the
// AMDGPUTargetLowering base implementation) is missing.
 19234 DAG, SNaN, Depth);
 19235}
19236
19237// On older subtargets, global FP atomic instructions have a hardcoded FP mode
19238// and do not support FP32 denormals, and only support v2f16/f64 denormals.
// NOTE(review): the static function signature on the next original line was
// dropped by the extraction -- confirm against the upstream source.
 19240 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
 19241 return true;
 19242
// Preserve-sign flushing is compatible with the hardware's hardcoded mode.
 19243 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
 19244 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
 19245 if (DenormMode == DenormalMode::getPreserveSign())
 19246 return true;
 19247
 19248 // TODO: Remove this.
 19249 return RMW->getFunction()
 19250 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
 19251 .getValueAsBool();
 19252}
19253
// NOTE(review): the signature (presumably a static helper returning an
// OptimizationRemark describing a hardware-lowered atomic) was dropped by
// the extraction -- confirm against the upstream source.
 19255 LLVMContext &Ctx = RMW->getContext();
// Use the named sync scope if registered; otherwise report "system".
 19256 StringRef MemScope =
 19257 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
 19258
 19259 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
 19260 << "Hardware instruction generated for atomic "
 19261 << RMW->getOperationName(RMW->getOperation())
 19262 << " operation at memory scope " << MemScope;
19264
19265static bool isV2F16OrV2BF16(Type *Ty) {
19266 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19267 Type *EltTy = VT->getElementType();
19268 return VT->getNumElements() == 2 &&
19269 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19270 }
19271
19272 return false;
19273}
19274
// \return true if \p Ty is exactly <2 x half>.
// NOTE(review): the line declaring VT (presumably a dyn_cast to
// FixedVectorType) was dropped by the extraction.
19275static bool isV2F16(Type *Ty) {
 19277 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
 19278}
19279
// \return true if \p Ty is exactly <2 x bfloat>.
// NOTE(review): the line declaring VT (presumably a dyn_cast to
// FixedVectorType) was dropped by the extraction.
19280static bool isV2BF16(Type *Ty) {
 19282 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
 19283}
19284
19285/// \return true if atomicrmw integer ops work for the type.
19286static bool isAtomicRMWLegalIntTy(Type *Ty) {
19287 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
19288 unsigned BW = IT->getBitWidth();
19289 return BW == 32 || BW == 64;
19290 }
19291
19292 return false;
19293}
19294
19295/// \return true if this atomicrmw xchg type can be selected.
19296static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
 19297 Type *Ty = RMW->getType();
// i32 / i64 are always fine.
 19298 if (isAtomicRMWLegalIntTy(Ty))
 19299 return true;
 19300
// Pointers whose representation is 32 or 64 bits wide are also selectable.
 19301 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
 19302 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
 19303 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
 19304 return BW == 32 || BW == 64;
 19305 }
 19306
 19307 if (Ty->isFloatTy() || Ty->isDoubleTy())
 19308 return true;
// NOTE(review): lines are missing here in the extraction (presumably the
// FixedVectorType guard for the 2 x 16-bit vector case below) -- confirm
// against the upstream source.
 19311 return VT->getNumElements() == 2 &&
 19312 VT->getElementType()->getPrimitiveSizeInBits() == 16;
 19313 }
 19314
 19315 return false;
 19316}
19317
19318/// \returns true if it's valid to emit a native instruction for \p RMW, based
19319/// on the properties of the target memory.
19320static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
19321 const AtomicRMWInst *RMW,
19322 bool HasSystemScope) {
19323 // The remote/fine-grained access logic is different from the integer
19324 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
19325 // fine-grained access does not work, even for a device local allocation.
19326 //
19327 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
19328 // allocations work.
19329 if (HasSystemScope) {
19330 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19331 RMW->hasMetadata("amdgpu.no.remote.memory"))
19332 return true;
19333 if (Subtarget.hasEmulatedSystemScopeAtomics())
19334 return true;
19335 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19336 return true;
19337
19338 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
19339}
19340
19341/// \return Action to perform on AtomicRMWInsts for integer operations.
19348
19349/// Return if a flat address space atomicrmw can access private memory.
// NOTE(review): the static function signature (taking the instruction I) and
// the tail of the return expression (presumably a check that the
// noalias.addrspace range excludes the private address space) were dropped
// by the extraction -- confirm against the upstream source.
 19351 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
 19352 return !MD ||
 19354}
19355
// NOTE(review): the signature and most of this helper were dropped by the
// extraction; it appears to choose an atomic expansion strategy for scratch
// (private) atomics, preferring flat lowering when scratch is globally
// addressable -- confirm against the upstream source.
 19358 // For GAS, lower to flat atomic.
 19359 return STI.hasGloballyAddressableScratch()
 19362}
19363
// NOTE(review): this is SITargetLowering::shouldExpandAtomicRMWInIR's body;
// the signature and many inner lines (case labels, return statements, and
// some expressions) were dropped by the doxygen extraction. Jumps in the
// embedded line numbers mark every gap -- confirm against upstream before
// editing. The function decides, per atomicrmw operation / type / address
// space / sync scope, whether to select a native instruction, expand to
// cmpxchg, or custom-expand.
 19366 unsigned AS = RMW->getPointerAddressSpace();
 19367 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
 19369
 19370 // 64-bit flat atomics that dynamically reside in private memory will silently
 19371 // be dropped.
 19372 //
 19373 // Note that we will emit a new copy of the original atomic in the expansion,
 19374 // which will be incrementally relegalized.
 19375 const DataLayout &DL = RMW->getFunction()->getDataLayout();
 19376 if (AS == AMDGPUAS::FLAT_ADDRESS &&
 19377 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
 19380
// Wrap a "None" (native instruction) decision with an optimization remark
// explaining that an unsafe request allowed the hardware instruction.
 19381 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
 19383 ORE.emit([=]() {
 19384 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
 19385 });
 19386 return Kind;
 19387 };
 19388
 19389 auto SSID = RMW->getSyncScopeID();
 19390 bool HasSystemScope =
 19391 SSID == SyncScope::System ||
 19392 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
 19393
 19394 auto Op = RMW->getOperation();
 19395 switch (Op) {
 19397 // PCIe supports add and xchg for system atomics.
 19398 return isAtomicRMWLegalXChgTy(RMW)
 19401 case AtomicRMWInst::Add:
 19402 // PCIe supports add and xchg for system atomics.
 19404 case AtomicRMWInst::Sub:
 19405 case AtomicRMWInst::And:
 19406 case AtomicRMWInst::Or:
 19407 case AtomicRMWInst::Xor:
 19408 case AtomicRMWInst::Max:
 19409 case AtomicRMWInst::Min:
 19416 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
 19418 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
 19421 auto *IT = dyn_cast<IntegerType>(RMW->getType());
 19422 if (!IT || IT->getBitWidth() != 32)
 19424 }
 19425
 19428 if (Subtarget->hasEmulatedSystemScopeAtomics())
 19430
 19431 // On most subtargets, for atomicrmw operations other than add/xchg,
 19432 // whether or not the instructions will behave correctly depends on where
 19433 // the address physically resides and what interconnect is used in the
 19434 // system configuration. On some some targets the instruction will nop,
 19435 // and in others synchronization will only occur at degraded device scope.
 19436 //
 19437 // If the allocation is known local to the device, the instructions should
 19438 // work correctly.
 19439 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
 19441
 19442 // If fine-grained remote memory works at device scope, we don't need to
 19443 // do anything.
 19444 if (!HasSystemScope &&
 19445 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
 19447
 19448 // If we are targeting a remote allocated address, it depends what kind of
 19449 // allocation the address belongs to.
 19450 //
 19451 // If the allocation is fine-grained (in host memory, or in PCIe peer
 19452 // device memory), the operation will fail depending on the target.
 19453 //
 19454 // Note fine-grained host memory access does work on APUs or if XGMI is
 19455 // used, but we do not know if we are targeting an APU or the system
 19456 // configuration from the ISA version/target-cpu.
 19457 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
 19459
 19462 // Atomic sub/or/xor do not work over PCI express, but atomic add
 19463 // does. InstCombine transforms these with 0 to or, so undo that.
 19464 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
 19465 ConstVal && ConstVal->isNullValue())
 19467 }
 19468
 19469 // If the allocation could be in remote, fine-grained memory, the rmw
 19470 // instructions may fail. cmpxchg should work, so emit that. On some
 19471 // system configurations, PCIe atomics aren't supported so cmpxchg won't
 19472 // even work, so you're out of luck anyway.
 19473
 19474 // In summary:
 19475 //
 19476 // Cases that may fail:
 19477 // - fine-grained pinned host memory
 19478 // - fine-grained migratable host memory
 19479 // - fine-grained PCIe peer device
 19480 //
 19481 // Cases that should work, but may be treated overly conservatively.
 19482 // - fine-grained host memory on an APU
 19483 // - fine-grained XGMI peer device
 19485 }
 19486
 19488 }
 19489 case AtomicRMWInst::FAdd: {
 19490 Type *Ty = RMW->getType();
 19491
 19492 // TODO: Handle REGION_ADDRESS
 19493 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
 19494 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
 19495 // is fixed to round-to-nearest-even.
 19496 //
 19497 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
 19498 // round-to-nearest-even.
 19499 //
 19500 // We ignore the rounding mode problem, even in strictfp. The C++ standard
 19501 // suggests it is OK if the floating-point mode may not match the calling
 19502 // thread.
 19503 if (Ty->isFloatTy()) {
 19504 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
 19506 }
 19507
 19508 if (Ty->isDoubleTy()) {
 19509 // Ignores denormal mode, but we don't consider flushing mandatory.
 19510 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
 19512 }
 19513
 19514 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
 19516
 19518 }
 19519
 19520 // LDS atomics respect the denormal mode from the mode register.
 19521 //
 19522 // Traditionally f32 global/buffer memory atomics would unconditionally
 19523 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
 19524 // flush.
 19525 //
 19526 // On targets with flat atomic fadd, denormals would flush depending on
 19527 // whether the target address resides in LDS or global memory. We consider
 19528 // this flat-maybe-flush as will-flush.
 19529 if (Ty->isFloatTy() &&
 19530 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
 19533
 19534 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
 19535 // safe. The message phrasing also should be better.
 19536 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
 19537 if (AS == AMDGPUAS::FLAT_ADDRESS) {
 19538 // gfx942, gfx12
 19539 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
 19540 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19541 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
 19542 // gfx90a, gfx942, gfx12
 19543 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
 19544 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19545
 19546 // gfx942, gfx12
 19547 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
 19548 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19549 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
 19550 // gfx90a, gfx942, gfx12
 19551 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
 19552 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19553
 19554 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
 19555 // buffer. gfx12 does have the buffer version.
 19556 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
 19557 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19558 }
 19559
 19560 // global and flat atomic fadd f64: gfx90a, gfx942.
 19561 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
 19562 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19563
 19564 if (AS != AMDGPUAS::FLAT_ADDRESS) {
 19565 if (Ty->isFloatTy()) {
 19566 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
 19567 // gfx11+.
 19568 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
 19569 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19570 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
 19571 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
 19572 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19573 } else {
 19574 // gfx908
 19575 if (RMW->use_empty() &&
 19576 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
 19577 isV2F16(Ty))
 19578 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19579 }
 19580 }
 19581
 19582 // flat atomic fadd f32: gfx942, gfx11+.
 19583 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
 19584 if (Subtarget->hasFlatAtomicFaddF32Inst())
 19585 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19586
 19587 // If it is in flat address space, and the type is float, we will try to
 19588 // expand it, if the target supports global and lds atomic fadd. The
 19589 // reason we need that is, in the expansion, we emit the check of
 19590 // address space. If it is in global address space, we emit the global
 19591 // atomic fadd; if it is in shared address space, we emit the LDS atomic
 19592 // fadd.
 19593 if (Subtarget->hasLDSFPAtomicAddF32()) {
 19594 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
 19596 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
 19598 }
 19599 }
 19600 }
 19601
 19603 }
 19605 case AtomicRMWInst::FMax: {
 19606 Type *Ty = RMW->getType();
 19607
 19608 // LDS float and double fmin/fmax were always supported.
 19609 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
 19610 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
 19612 }
 19613
 19614 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
 19615 // For flat and global cases:
 19616 // float, double in gfx7. Manual claims denormal support.
 19617 // Removed in gfx8.
 19618 // float, double restored in gfx10.
 19619 // double removed again in gfx11, so only f32 for gfx11/gfx12.
 19620 //
 19621 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
 19622 // no f32.
 19623 if (AS == AMDGPUAS::FLAT_ADDRESS) {
 19624 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
 19625 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19626 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
 19627 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19628 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
 19630 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
 19631 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19632 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
 19633 return ReportUnsafeHWInst(AtomicExpansionKind::None);
 19634 }
 19635 }
 19636
 19638 }
 19641 default:
 19643 }
 19644
 19645 llvm_unreachable("covered atomicrmw op switch");
 19646}
19647
19654
19661
// NOTE(review): the start of this signature (presumably
// TargetLowering::AtomicExpansionKind
// SITargetLowering::shouldExpandAtomicCmpXchgInIR) and two return statements
// were dropped by the extraction -- confirm against the upstream source.
 19664 const AtomicCmpXchgInst *CmpX) const {
 19665 unsigned AddrSpace = CmpX->getPointerAddressSpace();
 19666 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
 19668
// Only flat cmpxchg that may alias private needs special handling.
 19669 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
 19671
 19672 const DataLayout &DL = CmpX->getDataLayout();
 19673
 19674 Type *ValTy = CmpX->getNewValOperand()->getType();
 19675
 19676 // If a 64-bit flat atomic may alias private, we need to avoid using the
 19677 // atomic in the private case.
 19678 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
 19680}
19681
// Returns the register class to use for \p VT given divergence: SGPR classes
// for uniform values, VGPR (or AV on gfx90a+) classes for divergent values.
// NOTE(review): the line declaring RC (presumably the base-class
// getRegClassFor result) was dropped by the extraction -- confirm upstream.
19682const TargetRegisterClass *
19683SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
 19685 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
// Uniform i1 values live in the scalar condition-code register pair/word.
 19686 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
 19687 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
 19688 : &AMDGPU::SReg_32RegClass;
 19689 if (!TRI->isSGPRClass(RC) && !isDivergent)
 19690 return TRI->getEquivalentSGPRClass(RC);
 19691 if (TRI->isSGPRClass(RC) && isDivergent) {
 19692 if (Subtarget->hasGFX90AInsts())
 19693 return TRI->getEquivalentAVClass(RC);
 19694 return TRI->getEquivalentVGPRClass(RC);
 19695 }
 19696
 19697 return RC;
 19698}
19699
19700// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19701// uniform values (as produced by the mask results of control flow intrinsics)
19702// used outside of divergent blocks. The phi users need to also be treated as
19703// always uniform.
19704//
19705// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
// Recursively checks whether \p V (a wave-sized integer) is consumed by one
// of the AMDGPU control-flow intrinsics, directly or through its users.
19706static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19707 unsigned WaveSize) {
19708 // FIXME: We assume we never cast the mask results of a control flow
19709 // intrinsic.
19710 // Early exit if the type won't be consistent as a compile time hack.
19711 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19712 if (!IT || IT->getBitWidth() != WaveSize)
19713 return false;
19714
19715 if (!isa<Instruction>(V))
19716 return false;
// Visited set guards against cycles through phis and repeated work.
19717 if (!Visited.insert(V).second)
19718 return false;
19719 bool Result = false;
19720 for (const auto *U : V->users()) {
// NOTE(review): a line is missing here in the extraction (presumably a
// dyn_cast of U to IntrinsicInst, binding the name used below).
19722 if (V == U->getOperand(1)) {
19723 switch (Intrinsic->getIntrinsicID()) {
19724 default:
19725 Result = false;
19726 break;
19727 case Intrinsic::amdgcn_if_break:
19728 case Intrinsic::amdgcn_if:
19729 case Intrinsic::amdgcn_else:
19730 Result = true;
19731 break;
19732 }
19733 }
19734 if (V == U->getOperand(0)) {
19735 switch (Intrinsic->getIntrinsicID()) {
19736 default:
19737 Result = false;
19738 break;
19739 case Intrinsic::amdgcn_end_cf:
19740 case Intrinsic::amdgcn_loop:
19741 Result = true;
19742 break;
19743 }
19744 }
19745 } else {
// Non-intrinsic user: recurse to see if the value reaches one transitively.
19746 Result = hasCFUser(U, Visited, WaveSize);
19747 }
19748 if (Result)
19749 break;
19750 }
19751 return Result;
19752}
19753
// NOTE(review): the start of this signature (presumably
// bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, ...))
// was dropped by the extraction, along with a few inner lines (constraint
// computation and the Visited set declaration) -- confirm upstream.
 19755 const Value *V) const {
 19756 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
 19757 if (CI->isInlineAsm()) {
 19758 // FIXME: This cannot give a correct answer. This should only trigger in
 19759 // the case where inline asm returns mixed SGPR and VGPR results, used
 19760 // outside the defining block. We don't have a specific result to
 19761 // consider, so this assumes if any value is SGPR, the overall register
 19762 // also needs to be SGPR.
 19763 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
// NOTE(review): the first line of the TargetConstraints declaration is
// missing from the extraction.
 19765 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
 19766 for (auto &TC : TargetConstraints) {
 19767 if (TC.Type == InlineAsm::isOutput) {
// NOTE(review): a line is missing here (presumably computing the constraint
// to use before resolving it to a register class).
 19769 const TargetRegisterClass *RC =
 19770 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
 19771 TC.ConstraintVT)
 19772 .second;
 19773 if (RC && SIRI->isSGPRClass(RC))
 19774 return true;
 19775 }
 19776 }
 19777 }
 19778 }
// NOTE(review): the declaration of the Visited set passed below is missing.
 19780 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
 19781}
19782
// NOTE(review): the signature (presumably a helper checking whether any use
// of \p N is the base pointer of a memory SDNode) and the dyn_cast line
// binding M were dropped by the extraction -- confirm upstream.
 19784 for (SDUse &Use : N->uses()) {
 19786 if (getBasePtrIndex(M) == Use.getOperandNo())
 19787 return true;
 19788 }
 19789 }
 19790 return false;
 19791}
19792
// NOTE(review): the start of this signature (presumably
// bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,)
// and the tail of the final return expression were dropped by the
// extraction -- confirm against the upstream source.
 19794 SDValue N1) const {
// Reassociation is only considered when N0 has a single use.
 19795 if (!N0.hasOneUse())
 19796 return false;
 19797 // Take care of the opportunity to keep N0 uniform
 19798 if (N0->isDivergent() || !N1->isDivergent())
 19799 return true;
 19800 // Check if we have a good chance to form the memory access pattern with the
 19801 // base and offset
 19802 return (DAG.isBaseWithConstantOffset(N0) &&
 19804}
19805
// NOTE(review): the start of this signature (the GlobalISel Register
// overload of isReassocProfitable) was dropped by the extraction.
 19807 Register N0, Register N1) const {
 19808 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
 19809}
19810
// NOTE(review): the signature (presumably
// MachineMemOperand::Flags SITargetLowering::getTargetMMOFlags(...)) and the
// initialization of Flags were dropped by the extraction -- confirm upstream.
 19813 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
 19815 if (I.getMetadata("amdgpu.noclobber"))
 19816 Flags |= MONoClobber;
 19817 if (I.getMetadata("amdgpu.last.use"))
 19818 Flags |= MOLastUse;
 19819 return Flags;
 19820}
19821
// NOTE(review): this is the body of the flat-atomic address-space-predicate
// expansion (signature start dropped by the extraction, along with a few
// inner lines -- jumps in the embedded line numbers mark each gap; confirm
// against the upstream source). It rewrites a flat atomicrmw/cmpxchg into
// runtime-dispatched shared/private/global versions as described below.
 19823 Instruction *AI) const {
 19824 // Given: atomicrmw fadd ptr %addr, float %val ordering
 19825 //
 19826 // With this expansion we produce the following code:
 19827 // [...]
 19828 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
 19829 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
 19830 //
 19831 // atomicrmw.shared:
 19832 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
 19833 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
 19834 // float %val ordering
 19835 // br label %atomicrmw.phi
 19836 //
 19837 // atomicrmw.check.private:
 19838 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
 19839 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
 19840 //
 19841 // atomicrmw.private:
 19842 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
 19843 // %loaded.private = load float, ptr addrspace(5) %cast.private
 19844 // %val.new = fadd float %loaded.private, %val
 19845 // store float %val.new, ptr addrspace(5) %cast.private
 19846 // br label %atomicrmw.phi
 19847 //
 19848 // atomicrmw.global:
 19849 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
 19850 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
 19851 // float %val ordering
 19852 // br label %atomicrmw.phi
 19853 //
 19854 // atomicrmw.phi:
 19855 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
 19856 // [ %loaded.private, %atomicrmw.private ],
 19857 // [ %loaded.global, %atomicrmw.global ]
 19858 // br label %atomicrmw.end
 19859 //
 19860 // atomicrmw.end:
 19861 // [...]
 19862 //
 19863 //
 19864 // For 64-bit atomics which may reside in private memory, we perform a simpler
 19865 // version that only inserts the private check, and uses the flat operation.
 19866
 19867 IRBuilder<> Builder(AI);
 19868 LLVMContext &Ctx = Builder.getContext();
 19869
// AI is either an atomicrmw or a cmpxchg; RMW/CX discriminate the two.
 19870 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
 19871 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
 19873 Value *Addr = AI->getOperand(PtrOpIdx);
 19874
 19875 /// TODO: Only need to check private, then emit flat-known-not private (no
 19876 /// need for shared block, or cast to global).
 19878
 19879 Align Alignment;
 19880 if (RMW)
 19881 Alignment = RMW->getAlign();
 19882 else if (CX)
 19883 Alignment = CX->getAlign();
 19884 else
 19885 llvm_unreachable("unhandled atomic operation");
 19886
 19887 // FullFlatEmulation is true if we need to issue the private, shared, and
 19888 // global cases.
 19889 //
 19890 // If this is false, we are only dealing with the flat-targeting-private case,
 19891 // where we only insert a check for private and still use the flat instruction
 19892 // for global and shared.
 19893
 19894 bool FullFlatEmulation =
 19895 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
 19896 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
 19897 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
 19898 RMW->getType()->isDoubleTy()));
 19899
 19900 // If the return value isn't used, do not introduce a false use in the phi.
 19901 bool ReturnValueIsUsed = !AI->use_empty();
 19902
 19903 BasicBlock *BB = Builder.GetInsertBlock();
 19904 Function *F = BB->getParent();
 19905 BasicBlock *ExitBB =
 19906 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
 19907 BasicBlock *SharedBB = nullptr;
 19908
 19909 BasicBlock *CheckPrivateBB = BB;
 19910 if (FullFlatEmulation) {
 19911 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
 19912 CheckPrivateBB =
 19913 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
 19914 }
 19915
 19916 BasicBlock *PrivateBB =
 19917 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
 19918 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
 19919 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 19920
// Remove the unconditional branch created by splitBasicBlock; we emit our
// own control flow below.
 19921 std::prev(BB->end())->eraseFromParent();
 19922 Builder.SetInsertPoint(BB);
 19923
 19924 Value *LoadedShared = nullptr;
 19925 if (FullFlatEmulation) {
 19926 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
 19927 {Addr}, nullptr, "is.shared");
 19928 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 19929 Builder.SetInsertPoint(SharedBB);
 19930 Value *CastToLocal = Builder.CreateAddrSpaceCast(
 19932
// Clone the original atomic and retarget its pointer at the LDS cast.
 19933 Instruction *Clone = AI->clone();
 19934 Clone->insertInto(SharedBB, SharedBB->end());
 19935 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
 19936 LoadedShared = Clone;
 19937
 19938 Builder.CreateBr(PhiBB);
 19939 Builder.SetInsertPoint(CheckPrivateBB);
 19940 }
 19941
 19942 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
 19943 {Addr}, nullptr, "is.private");
 19944 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 19945
 19946 Builder.SetInsertPoint(PrivateBB);
 19947
 19948 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
 19950
// Private memory is single-threaded per lane, so emulate the atomic with a
// plain load/modify/store (or load/compare/store for cmpxchg).
 19951 Value *LoadedPrivate;
 19952 if (RMW) {
 19953 LoadedPrivate = Builder.CreateAlignedLoad(
 19954 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
 19955
 19956 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
 19957 LoadedPrivate, RMW->getValOperand());
 19958
 19959 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
 19960 } else {
 19961 auto [ResultLoad, Equal] =
 19962 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
 19963 CX->getNewValOperand(), CX->getAlign());
 19964
 19965 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
 19966 ResultLoad, 0);
 19967 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
 19968 }
 19969
 19970 Builder.CreateBr(PhiBB);
 19971
 19972 Builder.SetInsertPoint(GlobalBB);
 19973
 19974 // Continue using a flat instruction if we only emitted the check for private.
 19975 Instruction *LoadedGlobal = AI;
 19976 if (FullFlatEmulation) {
 19977 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
 19979 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
 19980 }
 19981
 19982 AI->removeFromParent();
 19983 AI->insertInto(GlobalBB, GlobalBB->end());
 19984
 19985 // The new atomicrmw may go through another round of legalization later.
 19986 if (!FullFlatEmulation) {
 19987 // We inserted the runtime check already, make sure we do not try to
 19988 // re-expand this.
 19989 // TODO: Should union with any existing metadata.
 19990 MDBuilder MDB(F->getContext());
 19991 MDNode *RangeNotPrivate =
 19994 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
 19995 RangeNotPrivate);
 19996 }
 19997
 19998 Builder.CreateBr(PhiBB);
 19999
 20000 Builder.SetInsertPoint(PhiBB);
 20001
 20002 if (ReturnValueIsUsed) {
 20003 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
 20004 AI->replaceAllUsesWith(Loaded);
 20005 if (FullFlatEmulation)
 20006 Loaded->addIncoming(LoadedShared, SharedBB);
 20007 Loaded->addIncoming(LoadedPrivate, PrivateBB);
 20008 Loaded->addIncoming(LoadedGlobal, GlobalBB);
 20009 Loaded->takeName(AI);
 20010 }
 20011
 20012 Builder.CreateBr(ExitBB);
 20013}
20014
// NOTE(review): the first signature line (presumably
// static void convertScratchAtomicToFlatAtomic(Instruction *I,) and an
// assertion about the pointer's address space were dropped by the
// extraction. Rewrites a scratch (private) atomic to operate through a flat
// pointer by inserting an addrspacecast before the instruction.
 20016 unsigned PtrOpIdx) {
 20017 Value *PtrOp = I->getOperand(PtrOpIdx);
 20020
 20021 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
 20022 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
 20023 I->getIterator());
 20024 I->setOperand(PtrOpIdx, ASCast);
 20025}
20026
// NOTE(review): this appears to be SITargetLowering::emitExpandAtomicRMW;
// the signature and several inner lines (including the branch structure and
// the operation rewrite) were dropped by the extraction -- jumps in the
// embedded numbering mark the gaps; confirm against the upstream source.
 20029
 20032
 20035 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
 20036 ConstVal && ConstVal->isNullValue()) {
 20037 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
 20039
 20040 // We may still need the private-alias-flat handling below.
 20041
 20042 // TODO: Skip this for cases where we cannot access remote memory.
 20043 }
 20044 }
 20045
 20046 // The non-flat expansions should only perform the de-canonicalization of
 20047 // identity values.
 20049 return;
 20050
 20052}
20053
// NOTE(review): this function was almost entirely dropped by the extraction;
// from the remaining assertion text it appears to handle expansion of atomic
// loads, converting SCRATCH-address-space loads to FLAT -- confirm against
// the upstream source before relying on this.
 20060
 20064
 20066 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
 20067}
20068
// NOTE(review): the signature (presumably
// void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const) and the
// start of the llvm_unreachable call were dropped by the extraction.
// Converts a private (scratch) atomic store to a flat atomic store.
 20070 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
 20071 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
 20072
 20074 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
 20075}
20076
// Rewrites an idempotent atomicrmw (one whose stored value equals the loaded
// value) into an atomic load with the same ordering, transferring metadata
// and uses; returns nullptr when the ordering has release semantics.
// NOTE(review): the second signature line (presumably
// SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI)
// const {) was dropped by the extraction -- confirm upstream.
20077LoadInst *
 20079 IRBuilder<> Builder(AI);
 20080 auto Order = AI->getOrdering();
 20081
 20082 // The optimization removes store aspect of the atomicrmw. Therefore, cache
 20083 // must be flushed if the atomic ordering had a release semantics. This is
 20084 // not necessary a fence, a release fence just coincides to do that flush.
 20085 // Avoid replacing of an atomicrmw with a release semantics.
 20086 if (isReleaseOrStronger(Order))
 20087 return nullptr;
 20088
 20089 LoadInst *LI = Builder.CreateAlignedLoad(
 20090 AI->getType(), AI->getPointerOperand(), AI->getAlign());
 20091 LI->setAtomic(Order, AI->getSyncScopeID());
 20092 LI->copyMetadata(*AI);
 20093 LI->takeName(AI);
// Replace and delete the original atomicrmw now that the load covers it.
 20094 AI->replaceAllUsesWith(LI);
 20095 AI->eraseFromParent();
 20096 return LI;
 20097}
static bool isMul(MachineInstr *MI)
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
@ DEFAULT
Default weight is used in cases when there is no dedicated execution weight set.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1269
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1266
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns true if it is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1175
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5976
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1564
bool isNegative() const
Definition APFloat.h:1516
bool isNormal() const
Definition APFloat.h:1520
APInt bitcastToAPInt() const
Definition APFloat.h:1408
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1193
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1134
bool isInfinity() const
Definition APFloat.h:1513
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:462
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:216
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding to a virtual register.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:804
Argument * getArg(unsigned i) const
Definition Function.h:886
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:252
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:246
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:249
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another operand to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:429
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:275
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:110
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:907
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
bool use_empty() const
Definition Value.h:347
iterator_range< use_iterator > uses()
Definition Value.h:381
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:970
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:336
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ AfterLegalizeTypes
Definition DAGCombine.h:17
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:403
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:308
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:251
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:478
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:420
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:167
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:178
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:241
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:186
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs