1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
70static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
71 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
72 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
73}
74
75static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
76 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
77 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
78}
79
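// Return the first SGPR_32 register that has not yet been allocated by CCInfo.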
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
90SITargetLowering::SITargetLowering(const TargetMachine &TM,
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, no operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
206
207 // We need to custom lower vector stores from local memory
208 setOperationAction(ISD::LOAD,
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
215 setOperationAction(ISD::STORE,
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
232 ISD::SETCC}) {
233 // FIXME: The promoted to type shouldn't need to be explicit
234 setOperationAction(Opc, MVT::bf16, Promote);
235 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
236 }
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
280
284 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
285
286 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
287
289 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
290
292 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
293 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
294
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299 Expand);
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 Expand);
305
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
309 Custom);
310
313 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
314
316
318
320 Expand);
321
322#if 0
324#endif
325
326 // We only support LOAD/STORE and vector manipulation ops for vectors
327 // with > 4 elements.
328 for (MVT VT :
329 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
337 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
338 switch (Op) {
339 case ISD::LOAD:
340 case ISD::STORE:
342 case ISD::BITCAST:
343 case ISD::UNDEF:
347 case ISD::IS_FPCLASS:
348 break;
353 break;
354 default:
356 break;
357 }
358 }
359 }
360
362
363 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
364 // is expanded to avoid having two separate loops in case the index is a VGPR.
365
366 // Most operations are naturally 32-bit vector operations. We only support
367 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
368 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
370 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
374
376 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
377
379 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
380 }
381
382 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
384 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
388
390 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
391
393 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
394 }
395
396 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
398 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
402
404 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
405
407 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
408 }
409
410 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
412 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
416
418 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
419
421 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
422 }
423
424 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
426 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
430
432 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
433
435 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
436 }
437
439 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
441 Custom);
442
443 if (Subtarget->hasPkMovB32()) {
444 // TODO: 16-bit element vectors should be legal with even aligned elements.
445 // TODO: Can be legal with wider source types than the result with
446 // subregister extracts.
447 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
448 }
449
451 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
452 // instead lower to cndmask in SITargetLowering::LowerSELECT().
454 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
455 // alignbit.
456 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
457
458 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
459 Custom);
460
461 // Avoid stack access for these.
462 // TODO: Generalize to more vector types.
464 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
466 Custom);
467
468 // Deal with vec3 vector operations when widened to vec4.
470 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
471
472 // Deal with vec5/6/7 vector operations when widened to vec8.
474 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
478 Custom);
479
480 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
481 // and output demarshalling
482 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
483
484 // We can't return success/failure, only the old value,
485 // let LLVM add the comparison
487 Expand);
488
489 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
490
491 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
492
493 // FIXME: This should be narrowed to i32, but that only happens if i64 is
494 // illegal.
495 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
496 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
497
498 // On SI this is s_memtime; on VI it is s_memrealtime.
500
501 if (Subtarget->hasSMemRealTime() ||
502 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
505
506 if (Subtarget->has16BitInsts()) {
509 } else {
511 }
512
513 if (Subtarget->hasMadMacF32Insts())
515
518
519 // We only really have 32-bit BFE instructions (and 16-bit on VI).
520 //
521 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
522 // effort to match them now. We want this to be false for i64 cases when the
523 // extraction isn't restricted to the upper or lower half. Ideally we would
524 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
525 // span the midpoint are probably relatively rare, so don't worry about them
526 // for now.
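// (For example, extracting bits [40:24] of an i64 spans the 32-bit midpoint,
// so it cannot be expressed as a single 32-bit BFE on either half.)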
528
529 // Clamp modifier on add/sub
530 if (Subtarget->hasIntClamp())
532
533 if (Subtarget->hasAddNoCarry())
534 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
535 Legal);
536
539 {MVT::f32, MVT::f64}, Custom);
540
541 // These are really only legal for ieee_mode functions. We should be avoiding
542 // them for functions that don't have ieee_mode enabled, so just say they are
543 // legal.
545 {MVT::f32, MVT::f64}, Legal);
546
547 if (Subtarget->haveRoundOpsF64())
549 Legal);
550 else
552 MVT::f64, Custom);
553
555 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
556 Legal);
557 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
558
561
562 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
564
565 // Custom lower these because we can't specify a rule based on an illegal
566 // source bf16.
569
570 if (Subtarget->has16BitInsts()) {
573 MVT::i16, Legal);
574
575 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
576
578 MVT::i16, Expand);
579
583 ISD::CTPOP},
584 MVT::i16, Promote);
585
587
588 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
589
591 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
593 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
594
598
600
601 // F16 - Constant Actions.
604
605 // F16 - Load/Store Actions.
607 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
609 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
610
611 // BF16 - Load/Store Actions.
613 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
615 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
616
617 // F16 - VOP1 Actions.
620 MVT::f16, Custom);
621
622 // BF16 - VOP1 Actions.
623 if (Subtarget->hasBF16TransInsts())
625
628
629 // F16 - VOP2 Actions.
630 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
631 Expand);
635
636 // F16 - VOP3 Actions.
638 if (STI.hasMadF16())
640
641 for (MVT VT :
642 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
643 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
644 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
645 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
646 switch (Op) {
647 case ISD::LOAD:
648 case ISD::STORE:
650 case ISD::BITCAST:
651 case ISD::UNDEF:
656 case ISD::IS_FPCLASS:
657 break;
661 break;
662 default:
664 break;
665 }
666 }
667 }
668
669 // v_perm_b32 can handle either of these.
670 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
672
673 // XXX - Do these do anything? Vector constants turn into build_vector.
674 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
675
676 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
677 Legal);
678
680 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
682 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
683
685 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
687 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
688
689 setOperationAction(ISD::AND, MVT::v2i16, Promote);
690 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
691 setOperationAction(ISD::OR, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
693 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
694 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
695
697 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
699 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
700 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
701 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
702
704 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
706 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
708 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
709
711 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
713 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
714 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
720 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721
723 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
725 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
727 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
728
729 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
731 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
733 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
734 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
735
737 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
739 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
740 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
741 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
742
743 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
744 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
745 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
747 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
748 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
749
751 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
753 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
754 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
755 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
756
758 MVT::v2i32, Expand);
760
762 MVT::v4i32, Expand);
763
765 MVT::v8i32, Expand);
766
767 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
768 Subtarget->hasVOP3PInsts() ? Legal : Custom);
769
770 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
771 // This isn't really legal, but this avoids the legalizer unrolling it (and
772 // allows matching fneg (fabs x) patterns)
773 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
774
775 // Can do this in one BFI plus a constant materialize.
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
780 Custom);
781
784 MVT::f16, Custom);
786
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
790 Custom);
791
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
794 Expand);
795
796 for (MVT Vec16 :
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
801 Vec16, Custom);
803 }
804 }
805
806 if (Subtarget->hasVOP3PInsts()) {
810 MVT::v2i16, Legal);
811
814 MVT::v2f16, Legal);
815
817 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
818
820 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
821 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
822 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
823 Custom);
824
825 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
826 // Split vector operations.
831 VT, Custom);
832
833 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
834 // Split vector operations.
836 VT, Custom);
837
840 {MVT::v2f16, MVT::v4f16}, Custom);
841
842 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
843 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
844 Custom);
845
846 if (Subtarget->hasBF16PackedInsts()) {
847 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
848 // Split vector operations.
850 VT, Custom);
851 }
852
853 if (Subtarget->hasPackedFP32Ops()) {
855 MVT::v2f32, Legal);
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
858 Custom);
859 }
860 }
861
863
864 if (Subtarget->has16BitInsts()) {
866 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
868 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
869 } else {
870 // Legalization hack.
871 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
872
874 }
875
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
881 Custom);
882
884
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
889
890 if (Subtarget->hasMad64_32())
892
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
899 } else {
900 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
901 if (Subtarget->hasMinimum3Maximum3F32())
903
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906
907 // If only the vector form is available, we need to widen to a vector.
908 if (!Subtarget->hasMinimum3Maximum3F16())
910 }
911 }
912
913 if (Subtarget->hasVOP3PInsts()) {
914 // We want to break these into v2f16 pieces, not scalarize.
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
917 Custom);
918 }
919
920 if (Subtarget->hasIntMinMax64())
922 Legal);
923
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
927 MVT::i8},
928 Custom);
929
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
935 Custom);
936
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
942 Custom);
943
949
950 // TODO: Could move this to custom lowering, could benefit from combines on
951 // extract of relevant bits.
953
955
956 if (Subtarget->hasBF16ConversionInsts()) {
957 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
959 }
960
961 if (Subtarget->hasBF16PackedInsts()) {
964 MVT::v2bf16, Legal);
965 }
966
967 if (Subtarget->hasBF16TransInsts()) {
969 }
970
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
974 Custom);
975 }
976
980 ISD::SUB,
982 ISD::MUL,
983 ISD::FADD,
984 ISD::FSUB,
985 ISD::FDIV,
986 ISD::FMUL,
995 ISD::FMA,
996 ISD::SMIN,
997 ISD::SMAX,
998 ISD::UMIN,
999 ISD::UMAX,
1000 ISD::SETCC,
1002 ISD::SMIN,
1003 ISD::SMAX,
1004 ISD::UMIN,
1005 ISD::UMAX,
1006 ISD::AND,
1007 ISD::OR,
1008 ISD::XOR,
1009 ISD::SHL,
1010 ISD::SRL,
1011 ISD::SRA,
1012 ISD::FSHR,
1022
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1025
1026 // All memory operations. Some folding on the pointer operand is done to help
1027 // matching the constant offsets in the addressing modes.
1029 ISD::STORE,
1054
1055 // FIXME: In other contexts we pretend this is a per-function property.
1057
1059}
1060
1061const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1062
1063ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1064 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1065 return RCRegs;
1066}
1067
1068//===----------------------------------------------------------------------===//
1069// TargetLowering queries
1070//===----------------------------------------------------------------------===//
1071
1072// v_mad_mix* support a conversion from f16 to f32.
1073//
1074// There is only one special case, when denormals are enabled, where this is
1075// still OK to use, and we don't currently handle it.
1076bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1077 EVT DestVT, EVT SrcVT) const {
1078 return DestVT.getScalarType() == MVT::f32 &&
1079 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1081 SrcVT.getScalarType() == MVT::f16) ||
1082 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1083 SrcVT.getScalarType() == MVT::bf16)) &&
1084 // TODO: This probably only requires no input flushing?
1086}
1087
1088bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1089 LLT DestTy, LLT SrcTy) const {
1090 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1091 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1092 DestTy.getScalarSizeInBits() == 32 &&
1093 SrcTy.getScalarSizeInBits() == 16 &&
1094 // TODO: This probably only requires no input flushing?
1095 denormalModeIsFlushAllF32(*MI.getMF());
1096}
1097
1098bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1099 // SI has some legal vector types, but no legal vector operations. Say no
1100 // shuffles are legal in order to prefer scalarizing some vector operations.
1101 return false;
1102}
1103
1104MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1105 CallingConv::ID CC,
1106 EVT VT) const {
1107 if (CC == CallingConv::AMDGPU_KERNEL)
1108 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1109
1110 if (VT.isVector()) {
1111 EVT ScalarVT = VT.getScalarType();
1112 unsigned Size = ScalarVT.getSizeInBits();
1113 if (Size == 16) {
1114 if (Subtarget->has16BitInsts())
1115 return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
1116 return VT.isInteger() ? MVT::i32 : MVT::f32;
1117 }
1118
1119 if (Size < 16)
1120 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1121 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1122 }
1123
1124 if (VT.getSizeInBits() > 32)
1125 return MVT::i32;
1126
1127 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1128}
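// For example, with 16-bit instructions a v3f16 argument is passed in two
// v2f16 registers, while without them it is passed in three f32 registers.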
1129
1130unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1131 CallingConv::ID CC,
1132 EVT VT) const {
1133 if (CC == CallingConv::AMDGPU_KERNEL)
1134 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1135
1136 if (VT.isVector()) {
1137 unsigned NumElts = VT.getVectorNumElements();
1138 EVT ScalarVT = VT.getScalarType();
1139 unsigned Size = ScalarVT.getSizeInBits();
1140
1141 // FIXME: Should probably promote 8-bit vectors to i16.
1142 if (Size == 16 && Subtarget->has16BitInsts())
1143 return (NumElts + 1) / 2;
1144
1145 if (Size <= 32)
1146 return NumElts;
1147
1148 if (Size > 32)
1149 return NumElts * ((Size + 31) / 32);
1150 } else if (VT.getSizeInBits() > 32)
1151 return (VT.getSizeInBits() + 31) / 32;
1152
1153 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1154}
1155
1156unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1157 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1158 unsigned &NumIntermediates, MVT &RegisterVT) const {
1159 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1160 unsigned NumElts = VT.getVectorNumElements();
1161 EVT ScalarVT = VT.getScalarType();
1162 unsigned Size = ScalarVT.getSizeInBits();
1163 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1164 // support, but unless we can properly handle 3-vectors, it will still be
1165 // inconsistent.
1166 if (Size == 16 && Subtarget->has16BitInsts()) {
1167 RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
1168 IntermediateVT = RegisterVT;
1169 NumIntermediates = (NumElts + 1) / 2;
1170 return NumIntermediates;
1171 }
1172
1173 if (Size == 32) {
1174 RegisterVT = ScalarVT.getSimpleVT();
1175 IntermediateVT = RegisterVT;
1176 NumIntermediates = NumElts;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size < 16 && Subtarget->has16BitInsts()) {
1181 // FIXME: Should probably form v2i16 pieces
1182 RegisterVT = MVT::i16;
1183 IntermediateVT = ScalarVT;
1184 NumIntermediates = NumElts;
1185 return NumIntermediates;
1186 }
1187
1188 if (Size != 16 && Size <= 32) {
1189 RegisterVT = MVT::i32;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size > 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = RegisterVT;
1198 NumIntermediates = NumElts * ((Size + 31) / 32);
1199 return NumIntermediates;
1200 }
1201 }
1202
1203 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1204 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1205}
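// For example, a v5f32 argument is broken into five f32 intermediates, one
// per 32-bit register, while a v3i64 becomes six i32 pieces.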
1206
1207static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1208 const DataLayout &DL, Type *Ty,
1209 unsigned MaxNumLanes) {
1210 assert(MaxNumLanes != 0);
1211
1212 LLVMContext &Ctx = Ty->getContext();
1213 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1214 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1215 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1216 NumElts);
1217 }
1218
1219 return TLI.getValueType(DL, Ty);
1220}
1221
1222// Peek through TFE struct returns to only use the data size.
1223static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1224 const DataLayout &DL, Type *Ty,
1225 unsigned MaxNumLanes) {
1226 auto *ST = dyn_cast<StructType>(Ty);
1227 if (!ST)
1228 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1229
1230 // TFE intrinsics return an aggregate type.
1231 assert(ST->getNumContainedTypes() == 2 &&
1232 ST->getContainedType(1)->isIntegerTy(32));
1233 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1234}
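// For example, a TFE return type of {<4 x float>, i32} with only two dmask
// bits set is treated as a v2f32 memory access.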
1235
1236/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1237/// in-memory representation. This return value is a custom type because there
1238/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1239/// could cause issues during codegen, these address space 7 pointers will be
1240/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1241/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1242/// for cost modeling, to work. (This also sets us up decently for doing the
1243/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1245 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1246 return MVT::amdgpuBufferFatPointer;
1248 DL.getPointerSizeInBits(AS) == 192)
1249 return MVT::amdgpuBufferStridedPointer;
1251}
1252/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1253/// v8i32 when padding is added.
1254/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1255/// also v8i32 with padding.
1257 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1258 DL.getPointerSizeInBits(AS) == 160) ||
1260 DL.getPointerSizeInBits(AS) == 192))
1261 return MVT::v8i32;
1263}
1264
1265static unsigned getIntrMemWidth(unsigned IntrID) {
1266 switch (IntrID) {
1267 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1268 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1269 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1270 return 8;
1271 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1272 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1273 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1274 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1275 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1276 return 32;
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1282 return 64;
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1288 return 128;
1289 default:
1290 llvm_unreachable("Unknown width");
1291 }
1292}
1293
1294static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1296 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1297 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1298 switch (AtomicOrderingCABI(Ord)) {
1301 break;
1304 break;
1307 break;
1308 default:
1310 break;
1311 }
1312
1313 Info.flags =
1315 Info.flags |= MOCooperative;
1316
1317 MDNode *ScopeMD = cast<MDNode>(
1318 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1319 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1320 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1321}
1322
1324 const CallBase &CI,
1325 MachineFunction &MF,
1326 unsigned IntrID) const {
1327 Info.flags = MachineMemOperand::MONone;
1328 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1329 Info.flags |= MachineMemOperand::MOInvariant;
1330 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1332 Info.flags |= getTargetMMOFlags(CI);
1333
1334 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1336 AttributeSet Attr =
1338 MemoryEffects ME = Attr.getMemoryEffects();
1339 if (ME.doesNotAccessMemory())
1340 return false;
1341
1342 // TODO: Should images get their own address space?
1343 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1344
1345 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1346 if (RsrcIntr->IsImage) {
1347 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1349 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1350 Info.align.reset();
1351 }
1352
1353 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1354 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1355 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1356 // We conservatively set the memory operand of a buffer intrinsic to the
1357 // base resource pointer, so that we can access alias information about
1358 // those pointers. Cases like "this points at the same value
1359 // but with a different offset" are handled in
1360 // areMemAccessesTriviallyDisjoint.
1361 Info.ptrVal = RsrcArg;
1362 }
1363
1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1365 if (!IsSPrefetch) {
1366 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1367 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1368 Info.flags |= MachineMemOperand::MOVolatile;
1369 }
1370
1372 if (ME.onlyReadsMemory()) {
1373 if (RsrcIntr->IsImage) {
1374 unsigned MaxNumLanes = 4;
1375
1376 if (!BaseOpcode->Gather4) {
1377 // If this isn't a gather, we may have excess loaded elements in the
1378 // IR type. Check the dmask for the real number of elements loaded.
1379 unsigned DMask =
1380 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1381 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1382 }
1383
1384 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1385 CI.getType(), MaxNumLanes);
1386 } else {
1387 Info.memVT =
1389 std::numeric_limits<unsigned>::max());
1390 }
1391
1392 // FIXME: What does alignment mean for an image?
1393 Info.opc = ISD::INTRINSIC_W_CHAIN;
1394 Info.flags |= MachineMemOperand::MOLoad;
1395 } else if (ME.onlyWritesMemory()) {
1396 Info.opc = ISD::INTRINSIC_VOID;
1397
1398 Type *DataTy = CI.getArgOperand(0)->getType();
1399 if (RsrcIntr->IsImage) {
1400 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1401 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1402 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1403 DMaskLanes);
1404 } else
1405 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1406
1407 Info.flags |= MachineMemOperand::MOStore;
1408 } else {
1409 // Atomic, NoReturn Sampler or prefetch
1410 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1412 Info.flags |=
1414
1415 if (!IsSPrefetch)
1416 Info.flags |= MachineMemOperand::MOStore;
1417
1418 switch (IntrID) {
1419 default:
1420 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1421 // Fake memory access type for no return sampler intrinsics
1422 Info.memVT = MVT::i32;
1423 } else {
1424 // XXX - Should this be volatile without known ordering?
1425 Info.flags |= MachineMemOperand::MOVolatile;
1426 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1427 }
1428 break;
1429 case Intrinsic::amdgcn_raw_buffer_load_lds:
1430 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1431 case Intrinsic::amdgcn_struct_buffer_load_lds:
1432 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1433 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1434 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1435 Info.ptrVal = CI.getArgOperand(1);
1436 return true;
1437 }
1438 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1439 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1440 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1441 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1442 Info.memVT =
1444 std::numeric_limits<unsigned>::max());
1445 Info.flags &= ~MachineMemOperand::MOStore;
1446 return true;
1447 }
1448 }
1449 }
1450 return true;
1451 }
1452
1453 switch (IntrID) {
1454 case Intrinsic::amdgcn_ds_ordered_add:
1455 case Intrinsic::amdgcn_ds_ordered_swap: {
1456 Info.opc = ISD::INTRINSIC_W_CHAIN;
1457 Info.memVT = MVT::getVT(CI.getType());
1458 Info.ptrVal = CI.getOperand(0);
1459 Info.align.reset();
1461
1462 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1463 if (!Vol->isZero())
1464 Info.flags |= MachineMemOperand::MOVolatile;
1465
1466 return true;
1467 }
1468 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1469 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1470 Info.opc = ISD::INTRINSIC_W_CHAIN;
1471 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1472 Info.ptrVal = nullptr;
1473 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1475 return true;
1476 }
1477 case Intrinsic::amdgcn_ds_append:
1478 case Intrinsic::amdgcn_ds_consume: {
1479 Info.opc = ISD::INTRINSIC_W_CHAIN;
1480 Info.memVT = MVT::getVT(CI.getType());
1481 Info.ptrVal = CI.getOperand(0);
1482 Info.align.reset();
1484
1485 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1486 if (!Vol->isZero())
1487 Info.flags |= MachineMemOperand::MOVolatile;
1488
1489 return true;
1490 }
1491 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1492 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1493 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1496 Info.memVT = MVT::getVT(CI.getType());
1497 Info.ptrVal = CI.getOperand(0);
1498 Info.memVT = MVT::i64;
1499 Info.size = 8;
1500 Info.align.reset();
1502 return true;
1503 }
1504 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1505 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1506 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1507 Info.opc = ISD::INTRINSIC_W_CHAIN;
1508 Info.memVT =
1509 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1510 ? CI.getType()
1512 ->getElementType(0)); // XXX: what is correct VT?
1513
1514 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1515 Info.align.reset();
1516 Info.flags |=
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_global_atomic_fmin_num:
1521 case Intrinsic::amdgcn_global_atomic_fmax_num:
1522 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1523 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1524 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1525 Info.opc = ISD::INTRINSIC_W_CHAIN;
1526 Info.memVT = MVT::getVT(CI.getType());
1527 Info.ptrVal = CI.getOperand(0);
1528 Info.align.reset();
1532 return true;
1533 }
1534 case Intrinsic::amdgcn_flat_load_monitor_b32:
1535 case Intrinsic::amdgcn_flat_load_monitor_b64:
1536 case Intrinsic::amdgcn_flat_load_monitor_b128:
1537 case Intrinsic::amdgcn_global_load_monitor_b32:
1538 case Intrinsic::amdgcn_global_load_monitor_b64:
1539 case Intrinsic::amdgcn_global_load_monitor_b128:
1540 case Intrinsic::amdgcn_cluster_load_b32:
1541 case Intrinsic::amdgcn_cluster_load_b64:
1542 case Intrinsic::amdgcn_cluster_load_b128:
1543 case Intrinsic::amdgcn_ds_load_tr6_b96:
1544 case Intrinsic::amdgcn_ds_load_tr4_b64:
1545 case Intrinsic::amdgcn_ds_load_tr8_b64:
1546 case Intrinsic::amdgcn_ds_load_tr16_b128:
1547 case Intrinsic::amdgcn_global_load_tr6_b96:
1548 case Intrinsic::amdgcn_global_load_tr4_b64:
1549 case Intrinsic::amdgcn_global_load_tr_b64:
1550 case Intrinsic::amdgcn_global_load_tr_b128:
1551 case Intrinsic::amdgcn_ds_read_tr4_b64:
1552 case Intrinsic::amdgcn_ds_read_tr6_b96:
1553 case Intrinsic::amdgcn_ds_read_tr8_b64:
1554 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1555 Info.opc = ISD::INTRINSIC_W_CHAIN;
1556 Info.memVT = MVT::getVT(CI.getType());
1557 Info.ptrVal = CI.getOperand(0);
1558 Info.align.reset();
1559 Info.flags |= MachineMemOperand::MOLoad;
1560 return true;
1561 }
1562 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1563 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1564 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1565 Info.opc = ISD::INTRINSIC_W_CHAIN;
1566 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1567 Info.ptrVal = CI.getOperand(0);
1568 Info.align.reset();
1569 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1570 return true;
1571 }
1572 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1573 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1574 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1575 Info.opc = ISD::INTRINSIC_VOID;
1576 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1577 Info.ptrVal = CI.getArgOperand(0);
1578 Info.align.reset();
1579 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1580 return true;
1581 }
1582 case Intrinsic::amdgcn_ds_gws_init:
1583 case Intrinsic::amdgcn_ds_gws_barrier:
1584 case Intrinsic::amdgcn_ds_gws_sema_v:
1585 case Intrinsic::amdgcn_ds_gws_sema_br:
1586 case Intrinsic::amdgcn_ds_gws_sema_p:
1587 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1588 Info.opc = ISD::INTRINSIC_VOID;
1589
1590 const GCNTargetMachine &TM =
1591 static_cast<const GCNTargetMachine &>(getTargetMachine());
1592
1594 Info.ptrVal = MFI->getGWSPSV(TM);
1595
1596 // This is an abstract access, but we need to specify a type and size.
1597 Info.memVT = MVT::i32;
1598 Info.size = 4;
1599 Info.align = Align(4);
1600
1601 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1602 Info.flags |= MachineMemOperand::MOLoad;
1603 else
1604 Info.flags |= MachineMemOperand::MOStore;
1605 return true;
1606 }
1607 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1608 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1609 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1610 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1611 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1612 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1613 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1614 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1615 Info.opc = ISD::INTRINSIC_VOID;
1616 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1617 Info.ptrVal = CI.getArgOperand(1);
1619 return true;
1620 }
1621 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1622 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1623 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1624 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1625 Info.opc = ISD::INTRINSIC_VOID;
1626 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1627 Info.ptrVal = CI.getArgOperand(0);
1629 return true;
1630 }
1631 case Intrinsic::amdgcn_load_to_lds:
1632 case Intrinsic::amdgcn_global_load_lds: {
1633 Info.opc = ISD::INTRINSIC_VOID;
1634 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1635 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1636 Info.ptrVal = CI.getArgOperand(1);
1638 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1639 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1640 Info.flags |= MachineMemOperand::MOVolatile;
1641 return true;
1642 }
1643 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1644 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1645 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1646 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1647 Info.opc = ISD::INTRINSIC_W_CHAIN;
1648
1649 const GCNTargetMachine &TM =
1650 static_cast<const GCNTargetMachine &>(getTargetMachine());
1651
1653 Info.ptrVal = MFI->getGWSPSV(TM);
1654
1655 // This is an abstract access, but we need to specify a type and size.
1656 Info.memVT = MVT::i32;
1657 Info.size = 4;
1658 Info.align = Align(4);
1659
1661 return true;
1662 }
1663 case Intrinsic::amdgcn_s_prefetch_data:
1664 case Intrinsic::amdgcn_flat_prefetch:
1665 case Intrinsic::amdgcn_global_prefetch: {
1666 Info.opc = ISD::INTRINSIC_VOID;
1667 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1668 Info.ptrVal = CI.getArgOperand(0);
1669 Info.flags |= MachineMemOperand::MOLoad;
1670 return true;
1671 }
1672 default:
1673 return false;
1674 }
1675}
1676
1678 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1680 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1681 // The DAG's ValueType loses the addrspaces.
1682 // Add them as 2 extra Constant operands "from" and "to".
1683 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1684 unsigned DstAS = I.getType()->getPointerAddressSpace();
1685 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1686 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1687 break;
1688 }
1689 default:
1690 break;
1691 }
1692}
1693
1696 Type *&AccessTy) const {
1697 Value *Ptr = nullptr;
1698 switch (II->getIntrinsicID()) {
1699 case Intrinsic::amdgcn_cluster_load_b128:
1700 case Intrinsic::amdgcn_cluster_load_b64:
1701 case Intrinsic::amdgcn_cluster_load_b32:
1702 case Intrinsic::amdgcn_ds_append:
1703 case Intrinsic::amdgcn_ds_consume:
1704 case Intrinsic::amdgcn_ds_load_tr8_b64:
1705 case Intrinsic::amdgcn_ds_load_tr16_b128:
1706 case Intrinsic::amdgcn_ds_load_tr4_b64:
1707 case Intrinsic::amdgcn_ds_load_tr6_b96:
1708 case Intrinsic::amdgcn_ds_read_tr4_b64:
1709 case Intrinsic::amdgcn_ds_read_tr6_b96:
1710 case Intrinsic::amdgcn_ds_read_tr8_b64:
1711 case Intrinsic::amdgcn_ds_read_tr16_b64:
1712 case Intrinsic::amdgcn_ds_ordered_add:
1713 case Intrinsic::amdgcn_ds_ordered_swap:
1714 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1715 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1716 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1717 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1718 case Intrinsic::amdgcn_flat_load_monitor_b128:
1719 case Intrinsic::amdgcn_flat_load_monitor_b32:
1720 case Intrinsic::amdgcn_flat_load_monitor_b64:
1721 case Intrinsic::amdgcn_global_atomic_fmax_num:
1722 case Intrinsic::amdgcn_global_atomic_fmin_num:
1723 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1724 case Intrinsic::amdgcn_global_load_monitor_b128:
1725 case Intrinsic::amdgcn_global_load_monitor_b32:
1726 case Intrinsic::amdgcn_global_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_load_tr_b64:
1728 case Intrinsic::amdgcn_global_load_tr_b128:
1729 case Intrinsic::amdgcn_global_load_tr4_b64:
1730 case Intrinsic::amdgcn_global_load_tr6_b96:
1731 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1732 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1733 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1734 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1735 Ptr = II->getArgOperand(0);
1736 break;
1737 case Intrinsic::amdgcn_load_to_lds:
1738 case Intrinsic::amdgcn_global_load_lds:
1739 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1740 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1741 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1742 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1743 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1744 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1745 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1746 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1747 Ptr = II->getArgOperand(1);
1748 break;
1749 default:
1750 return false;
1751 }
1752 AccessTy = II->getType();
1753 Ops.push_back(Ptr);
1754 return true;
1755}
1756
1758 unsigned AddrSpace) const {
1759 if (!Subtarget->hasFlatInstOffsets()) {
1760 // Flat instructions do not have offsets, and only have the register
1761 // address.
1762 return AM.BaseOffs == 0 && AM.Scale == 0;
1763 }
1764
1765 decltype(SIInstrFlags::FLAT) FlatVariant =
1769
1770 return AM.Scale == 0 &&
1771 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1772 AM.BaseOffs, AddrSpace, FlatVariant));
1773}
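// For example, (reg + 1024) is legal here whenever isLegalFLATOffset accepts
// the immediate for the selected flat variant, but a scaled-index form such as
// (reg + 4 * idx) is always rejected because Scale must be zero.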
1774
1776 if (Subtarget->hasFlatGlobalInsts())
1778
1779 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1780 // Assume that we will use FLAT for all global memory accesses
1781 // on VI.
1782 // FIXME: This assumption is currently wrong. On VI we still use
1783 // MUBUF instructions for the r + i addressing mode. As currently
1784 // implemented, the MUBUF instructions only work on buffers < 4 GB.
1785 // It may be possible to support > 4GB buffers with MUBUF instructions,
1786 // by setting the stride value in the resource descriptor which would
1787 // increase the size limit to (stride * 4GB). However, this is risky,
1788 // because it has never been validated.
1790 }
1791
1792 return isLegalMUBUFAddressingMode(AM);
1793}
1794
1795bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1796 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1797 // additionally can do r + r + i with addr64. 32-bit has more addressing
1798 // mode options. Depending on the resource constant, it can also do
1799 // (i64 r0) + (i32 r1) * (i14 i).
1800 //
1801 // Private arrays end up using a scratch buffer most of the time, so also
1802 // assume those use MUBUF instructions. Scratch loads / stores are currently
1803 // implemented as MUBUF instructions with the offen bit set, so they are
1804 // slightly different from the normal addr64 mode.
1805 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1806 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1807 return false;
1808
1809 // FIXME: Since we can split immediate into soffset and immediate offset,
1810 // would it make sense to allow any immediate?
1811
1812 switch (AM.Scale) {
1813 case 0: // r + i or just i, depending on HasBaseReg.
1814 return true;
1815 case 1:
1816 return true; // We have r + r or r + i.
1817 case 2:
1818 if (AM.HasBaseReg) {
1819 // Reject 2 * r + r.
1820 return false;
1821 }
1822
1823 // Allow 2 * r as r + r
1824 // Or 2 * r + i is allowed as r + r + i.
1825 return true;
1826 default: // Don't allow n * r
1827 return false;
1828 }
1829}
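// For example, 2 * r is accepted because it can be rewritten as r + r, but a
// mode like 3 * r has no MUBUF equivalent and is rejected.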
1830
1832 const AddrMode &AM, Type *Ty,
1833 unsigned AS,
1834 Instruction *I) const {
1835 // No global is ever allowed as a base.
1836 if (AM.BaseGV)
1837 return false;
1838
1839 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1840 return isLegalGlobalAddressingMode(AM);
1841
1842 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1846 // If the offset isn't a multiple of 4, it probably isn't going to be
1847 // correctly aligned.
1848 // FIXME: Can we get the real alignment here?
1849 if (AM.BaseOffs % 4 != 0)
1850 return isLegalMUBUFAddressingMode(AM);
1851
1852 if (!Subtarget->hasScalarSubwordLoads()) {
1853 // There are no SMRD extloads, so if we have to do a small type access we
1854 // will use a MUBUF load.
1855 // FIXME?: We also need to do this if unaligned, but we don't know the
1856 // alignment here.
1857 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1858 return isLegalGlobalAddressingMode(AM);
1859 }
1860
1861 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1862 // SMRD instructions have an 8-bit, dword offset on SI.
1863 if (!isUInt<8>(AM.BaseOffs / 4))
1864 return false;
1865 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1866 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1867 // in 8-bits, it can use a smaller encoding.
1868 if (!isUInt<32>(AM.BaseOffs / 4))
1869 return false;
1870 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1871 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1872 if (!isUInt<20>(AM.BaseOffs))
1873 return false;
1874 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1875 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1876 // for S_BUFFER_* instructions).
1877 if (!isInt<21>(AM.BaseOffs))
1878 return false;
1879 } else {
1880 // On GFX12, all offsets are signed 24-bit in bytes.
1881 if (!isInt<24>(AM.BaseOffs))
1882 return false;
1883 }
1884
1885 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1887 AM.BaseOffs < 0) {
1888 // Scalar (non-buffer) loads can only use a negative offset if
1889 // soffset+offset is non-negative. Since the compiler can only prove that
1890 // in a few special cases, it is safer to claim that negative offsets are
1891 // not supported.
1892 return false;
1893 }
1894
1895 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1896 return true;
1897
1898 if (AM.Scale == 1 && AM.HasBaseReg)
1899 return true;
1900
1901 return false;
1902 }
1903
1904 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1905 return Subtarget->enableFlatScratch()
1907 : isLegalMUBUFAddressingMode(AM);
1908
1909 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1910 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1911 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1912 // field.
1913 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1914 // an 8-bit dword offset but we don't know the alignment here.
1915 if (!isUInt<16>(AM.BaseOffs))
1916 return false;
1917
1918 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1919 return true;
1920
1921 if (AM.Scale == 1 && AM.HasBaseReg)
1922 return true;
1923
1924 return false;
1925 }
1926
1927 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1928 // For an unknown address space, this usually means that this is for some
1929 // reason being used for pure arithmetic, and not based on some addressing
1930 // computation. We don't have instructions that compute pointers with any
1931 // addressing modes, so treat them as having no offset like flat
1932 // instructions.
1933 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1934 }
1935
1936 // Assume a user alias of global for unknown address spaces.
1937 return isLegalGlobalAddressingMode(AM);
1938}
1939
1940 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1941 const MachineFunction &MF) const {
1942 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1943 return (MemVT.getSizeInBits() <= 4 * 32);
1944 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1945 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1946 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1947 }
1948 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1949 return (MemVT.getSizeInBits() <= 2 * 32);
1950 return true;
1951}
1952
1953 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1954 unsigned Size, unsigned AddrSpace, Align Alignment,
1955 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1956 if (IsFast)
1957 *IsFast = 0;
1958
1959 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1960 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1961 // Check if alignment requirements for ds_read/write instructions are
1962 // disabled.
1963 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1964 return false;
1965
1966 Align RequiredAlignment(
1967 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
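// For example, a 96-bit access gives divideCeil(96, 8) == 12 bytes, which
// PowerOf2Ceil rounds up to a 16-byte natural alignment.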
1968 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1969 Alignment < RequiredAlignment)
1970 return false;
1971
1972 // Either the alignment requirements are "enabled", or there is an
1973 // unaligned LDS access related hardware bug even though the alignment
1974 // requirements are "disabled". In either case, we need to check for proper
1975 // alignment requirements.
1976 //
1977 switch (Size) {
1978 case 64:
1979 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1980 // address is negative, then the instruction is incorrectly treated as
1981 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1982 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1983 // load later in the SILoadStoreOptimizer.
1984 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1985 return false;
1986
1987 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1988 // can do a 4-byte-aligned, 8-byte access in a single operation using
1989 // ds_read2/write2_b32 with adjacent offsets.
1990 RequiredAlignment = Align(4);
1991
1992 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1993 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1994 // ds_write2_b32 depending on the alignment. In either case with either
1995 // alignment there is no faster way of doing this.
1996
1997 // The numbers returned here and below are not additive, it is a 'speed
1998 // rank'. They are just meant to be compared to decide if a certain way
1999 // of lowering an operation is faster than another. For that purpose
2000 // naturally aligned operation gets it bitsize to indicate that "it
2001 // operates with a speed comparable to N-bit wide load". With the full
2002 // alignment ds128 is slower than ds96 for example. If underaligned it
2003 // is comparable to a speed of a single dword access, which would then
2004 // mean 32 < 128 and it is faster to issue a wide load regardless.
2005 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
2006 // a wider load which will no longer be aligned, the latter is slower.
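// As a concrete illustration: an underaligned 128-bit DS access reporting 32
// still beats issuing four separate dword accesses, whereas a rank of 1 means
// the narrower, aligned accesses are the faster choice.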
2007 if (IsFast)
2008 *IsFast = (Alignment >= RequiredAlignment) ? 64
2009 : (Alignment < Align(4)) ? 32
2010 : 1;
2011 return true;
2012 }
2013
2014 break;
2015 case 96:
2016 if (!Subtarget->hasDS96AndDS128())
2017 return false;
2018
2019 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2020 // gfx8 and older.
2021
2022 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2023 // Naturally aligned access is fastest. However, also report it is Fast
2024 // if memory is aligned less than a DWORD. A narrow load or store will
2025 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2026 // be more of them, so overall we will pay less penalty issuing a single
2027 // instruction.
2028
2029 // See comment on the values above.
2030 if (IsFast)
2031 *IsFast = (Alignment >= RequiredAlignment) ? 96
2032 : (Alignment < Align(4)) ? 32
2033 : 1;
2034 return true;
2035 }
2036
2037 break;
2038 case 128:
2039 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2040 return false;
2041
2042 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2043 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
2044 // single operation using ds_read2/write2_b64.
2045 RequiredAlignment = Align(8);
2046
2047 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2048 // Naturally aligned access is fastest. However, also report it is Fast
2049 // if memory is aligned less than a DWORD. A narrow load or store will
2050 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2051 // will be more of them, so overall we will pay less penalty issuing a
2052 // single instruction.
2053
2054 // See comment on the values above.
2055 if (IsFast)
2056 *IsFast = (Alignment >= RequiredAlignment) ? 128
2057 : (Alignment < Align(4)) ? 32
2058 : 1;
2059 return true;
2060 }
2061
2062 break;
2063 default:
2064 if (Size > 32)
2065 return false;
2066
2067 break;
2068 }
2069
2070 // See comment on the values above.
2071 // Note that we have a single-dword or sub-dword here, so if underaligned
2072 // it is a slowest possible access, hence returned value is 0.
2073 if (IsFast)
2074 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2075
2076 return Alignment >= RequiredAlignment ||
2077 Subtarget->hasUnalignedDSAccessEnabled();
2078 }
2079
2080 // FIXME: We have to be conservative here and assume that flat operations
2081 // will access scratch. If we had access to the IR function, then we
2082 // could determine if any private memory was used in the function.
2083 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2084 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2085 bool AlignedBy4 = Alignment >= Align(4);
2086 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2087 if (IsFast)
2088 *IsFast = AlignedBy4 ? Size : 1;
2089 return true;
2090 }
2091
2092 if (IsFast)
2093 *IsFast = AlignedBy4;
2094
2095 return AlignedBy4;
2096 }
2097
2098 // So long as they are correct, wide global memory operations perform better
2099 // than multiple smaller memory ops -- even when misaligned
2100 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2101 if (IsFast)
2102 *IsFast = Size;
2103
2104 return Alignment >= Align(4) ||
2105 Subtarget->hasUnalignedBufferAccessEnabled();
2106 }
2107
2108 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2109 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2110 // out-of-bounds behavior, but in the edge case where an access starts
2111 // out-of-bounds and then enters in-bounds, the entire access would be treated
2112 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2113 // natural alignment of buffer accesses.
2114 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2115 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2116 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2117 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2118 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2119 return false;
2120 }
2121
2122 // Values smaller than a dword must be aligned.
2123 if (Size < 32)
2124 return false;
2125
2126 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2127 // byte-address are ignored, thus forcing Dword alignment.
2128 // This applies to private, global, and constant memory.
2129 if (IsFast)
2130 *IsFast = 1;
2131
2132 return Size >= 32 && Alignment >= Align(4);
2133}
2134
2135 bool SITargetLowering::allowsMisalignedMemoryAccesses(
2136 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2137 unsigned *IsFast) const {
2138 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2139 Alignment, Flags, IsFast);
2140}
2141
2142 EVT SITargetLowering::getOptimalMemOpType(
2143 LLVMContext &Context, const MemOp &Op,
2144 const AttributeList &FuncAttributes) const {
2145 // FIXME: Should account for address space here.
2146
2147 // The default fallback uses the private pointer size as a guess for a type to
2148 // use. Make sure we switch these to 64-bit accesses.
2149
2150 if (Op.size() >= 16 &&
2151 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2152 return MVT::v4i32;
2153
2154 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2155 return MVT::v2i32;
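// As an illustration, a 16-byte copy with a 4-byte-aligned destination is
// therefore expanded with v4i32 accesses instead of the private-pointer-sized
// default.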
2156
2157 // Use the default.
2158 return MVT::Other;
2159}
2160
2161 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2162 const MemSDNode *MemNode = cast<MemSDNode>(N);
2163 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2164}
2165
2170
2171 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2172 unsigned DestAS) const {
2173 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2174 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2175 Subtarget->hasGloballyAddressableScratch()) {
2176 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2177 return false;
2178 }
2179
2180 // Flat -> private/local is a simple truncate.
2181 // Flat -> global is no-op
2182 return true;
2183 }
2184
2185 const GCNTargetMachine &TM =
2186 static_cast<const GCNTargetMachine &>(getTargetMachine());
2187 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2188}
2189
2197
2198 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2199 Type *Ty) const {
2200 // FIXME: Could be smarter if called for vector constants.
2201 return true;
2202}
2203
2204 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2205 unsigned Index) const {
2206 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2207 return false;
2208
2209 // TODO: Add more cases that are cheap.
2210 return Index == 0;
2211}
2212
2213bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2214 // TODO: This should be more aggressive, particular for 16-bit element
2215 // vectors. However there are some mixed improvements and regressions.
2216 EVT EltTy = VT.getVectorElementType();
2217 return EltTy.getSizeInBits() % 32 == 0;
2218}
2219
2220 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2221 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2222 switch (Op) {
2223 case ISD::LOAD:
2224 case ISD::STORE:
2225 return true;
2226 default:
2227 return false;
2228 }
2229 }
2230
2231 // SimplifySetCC uses this function to determine whether or not it should
2232 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2233 if (VT == MVT::i1 && Op == ISD::SETCC)
2234 return false;
2235
2237}
2238
2241 // This isn't really a constant pool but close enough.
2244 return PtrInfo;
2245}
2246
2247SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2248 const SDLoc &SL,
2249 SDValue Chain,
2250 uint64_t Offset) const {
2251 const DataLayout &DL = DAG.getDataLayout();
2255
2256 auto [InputPtrReg, RC, ArgTy] =
2257 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2258
2259 // We may not have the kernarg segment argument if we have no kernel
2260 // arguments.
2261 if (!InputPtrReg)
2262 return DAG.getConstant(Offset, SL, PtrVT);
2263
2265 SDValue BasePtr = DAG.getCopyFromReg(
2266 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2267
2268 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2269}
2270
2271SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2272 const SDLoc &SL) const {
2275 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2276}
2277
2278SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2279 const SDLoc &SL) const {
2280
2281 MachineFunction &MF = DAG.getMachineFunction();
2282 std::optional<uint32_t> KnownSize =
2283 AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
2284 if (KnownSize.has_value())
2285 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2286 return SDValue();
2287}
2288
2289SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2290 const SDLoc &SL, SDValue Val,
2291 bool Signed,
2292 const ISD::InputArg *Arg) const {
2293 // First, if it is a widened vector, narrow it.
2294 if (VT.isVector() &&
2296 EVT NarrowedVT =
2299 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2300 DAG.getConstant(0, SL, MVT::i32));
2301 }
2302
2303 // Then convert the vector elements or scalar value.
2304 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2305 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2306 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2307 }
2308
2309 if (MemVT.isFloatingPoint())
2310 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2311 else if (Signed)
2312 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2313 else
2314 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2315
2316 return Val;
2317}
2318
2319SDValue SITargetLowering::lowerKernargMemParameter(
2320 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2321 uint64_t Offset, Align Alignment, bool Signed,
2322 const ISD::InputArg *Arg) const {
2323
2324 MachinePointerInfo PtrInfo =
2326
2327 // Try to avoid using an extload by loading earlier than the argument address,
2328 // and extracting the relevant bits. The load should hopefully be merged with
2329 // the previous argument.
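// For example, an i16 argument at byte offset 2 becomes a dword load at
// offset 0 followed by a shift right by 16 and a truncate to i16.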
2330 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2331 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2332 int64_t AlignDownOffset = alignDown(Offset, 4);
2333 int64_t OffsetDiff = Offset - AlignDownOffset;
2334
2335 EVT IntVT = MemVT.changeTypeToInteger();
2336
2337 // TODO: If we passed in the base kernel offset we could have a better
2338 // alignment than 4, but we don't really need it.
2339 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2340 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2341 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2342 MachineMemOperand::MODereferenceable |
2343 MachineMemOperand::MOInvariant);
2344
2345 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2346 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2347
2348 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2349 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2350 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2351
2352 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2353 }
2354
2355 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2356 SDValue Load = DAG.getLoad(
2357 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2358 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2359
2360 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2361 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2362}
2363
2364/// Coerce an argument which was passed in a different ABI type to the original
2365/// expected value type.
2366SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2367 SDValue Val,
2368 CCValAssign &VA,
2369 const SDLoc &SL) const {
2370 EVT ValVT = VA.getValVT();
2371
2372 // If this is an 8 or 16-bit value, it is really passed promoted
2373 // to 32 bits. Insert an assert[sz]ext to capture this, then
2374 // truncate to the right size.
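// For example, an i16 value passed SExt in an i32 location becomes an
// AssertSext to i16 on the i32 value followed by a truncate back to i16.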
2375 switch (VA.getLocInfo()) {
2376 case CCValAssign::Full:
2377 return Val;
2378 case CCValAssign::BCvt:
2379 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2380 case CCValAssign::SExt:
2381 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2382 DAG.getValueType(ValVT));
2383 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2384 case CCValAssign::ZExt:
2385 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2386 DAG.getValueType(ValVT));
2387 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2388 case CCValAssign::AExt:
2389 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2390 default:
2391 llvm_unreachable("Unknown loc info!");
2392 }
2393}
2394
2395SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2396 CCValAssign &VA, const SDLoc &SL,
2397 SDValue Chain,
2398 const ISD::InputArg &Arg) const {
2399 MachineFunction &MF = DAG.getMachineFunction();
2400 MachineFrameInfo &MFI = MF.getFrameInfo();
2401
2402 if (Arg.Flags.isByVal()) {
2403 unsigned Size = Arg.Flags.getByValSize();
2404 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2405 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2406 }
2407
2408 unsigned ArgOffset = VA.getLocMemOffset();
2409 unsigned ArgSize = VA.getValVT().getStoreSize();
2410
2411 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2412
2413 // Create load nodes to retrieve arguments from the stack.
2414 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2415
2416 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2417 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2418 MVT MemVT = VA.getValVT();
2419
2420 switch (VA.getLocInfo()) {
2421 default:
2422 break;
2423 case CCValAssign::BCvt:
2424 MemVT = VA.getLocVT();
2425 break;
2426 case CCValAssign::SExt:
2427 ExtType = ISD::SEXTLOAD;
2428 break;
2429 case CCValAssign::ZExt:
2430 ExtType = ISD::ZEXTLOAD;
2431 break;
2432 case CCValAssign::AExt:
2433 ExtType = ISD::EXTLOAD;
2434 break;
2435 }
2436
2437 SDValue ArgValue = DAG.getExtLoad(
2438 ExtType, SL, VA.getLocVT(), Chain, FIN,
2439 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2440
2441 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2442 if (ConvertedVal == ArgValue)
2443 return ConvertedVal;
2444
2445 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2446}
2447
2448SDValue SITargetLowering::lowerWorkGroupId(
2449 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2450 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2451 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2452 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2453 if (!Subtarget->hasClusters())
2454 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2455
2456 // Clusters are supported. Return the global position in the grid. If clusters
2457 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2458
2459 // WorkGroupIdXYZ = ClusterId == 0 ?
2460 // ClusterIdXYZ :
2461 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
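// For example, with 4 workgroups per cluster in X (ClusterMaxIdX == 3), the
// second workgroup of cluster 2 gets WorkGroupIdX = 2 * (3 + 1) + 1 = 9.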
2462 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2463 SDLoc SL(ClusterIdXYZ);
2464 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2465 SDValue One = DAG.getConstant(1, SL, VT);
2466 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2467 SDValue ClusterWorkGroupIdXYZ =
2468 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2469 SDValue GlobalIdXYZ =
2470 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2471 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2472
2473 switch (MFI.getClusterDims().getKind()) {
2476 return GlobalIdXYZ;
2478 return ClusterIdXYZ;
2480 using namespace AMDGPU::Hwreg;
2481 SDValue ClusterIdField =
2482 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2483 SDNode *GetReg =
2484 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2485 SDValue ClusterId(GetReg, 0);
2486 SDValue Zero = DAG.getConstant(0, SL, VT);
2487 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2488 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2489 }
2490 }
2491
2492 llvm_unreachable("nothing should reach here");
2493}
2494
2495SDValue SITargetLowering::getPreloadedValue(
2496 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2497 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2498 const ArgDescriptor *Reg = nullptr;
2499 const TargetRegisterClass *RC;
2500 LLT Ty;
2501
2502 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2503 const ArgDescriptor WorkGroupIDX =
2504 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2505 // If GridZ is not programmed in an entry function then the hardware will set
2506 // it to all zeros, so there is no need to mask the GridY value in the low
2507 // order bits.
2508 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2509 AMDGPU::TTMP7,
2510 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2511 const ArgDescriptor WorkGroupIDZ =
2512 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2513 const ArgDescriptor ClusterWorkGroupIDX =
2514 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2515 const ArgDescriptor ClusterWorkGroupIDY =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2517 const ArgDescriptor ClusterWorkGroupIDZ =
2518 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2519 const ArgDescriptor ClusterWorkGroupMaxIDX =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2521 const ArgDescriptor ClusterWorkGroupMaxIDY =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2523 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2525 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2526 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
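// Taken together, the masks above describe TTMP6 as packing, from low to high
// nibbles: cluster workgroup id X/Y/Z, cluster workgroup max id X/Y/Z, and the
// max flat id.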
2527
2528 auto LoadConstant = [&](unsigned N) {
2529 return DAG.getConstant(N, SDLoc(), VT);
2530 };
2531
2532 if (Subtarget->hasArchitectedSGPRs() &&
2534 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2535 bool HasFixedDims = ClusterDims.isFixedDims();
2536
2537 switch (PVID) {
2539 Reg = &WorkGroupIDX;
2540 RC = &AMDGPU::SReg_32RegClass;
2541 Ty = LLT::scalar(32);
2542 break;
2544 Reg = &WorkGroupIDY;
2545 RC = &AMDGPU::SReg_32RegClass;
2546 Ty = LLT::scalar(32);
2547 break;
2549 Reg = &WorkGroupIDZ;
2550 RC = &AMDGPU::SReg_32RegClass;
2551 Ty = LLT::scalar(32);
2552 break;
2554 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2555 return LoadConstant(0);
2556 Reg = &ClusterWorkGroupIDX;
2557 RC = &AMDGPU::SReg_32RegClass;
2558 Ty = LLT::scalar(32);
2559 break;
2561 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2562 return LoadConstant(0);
2563 Reg = &ClusterWorkGroupIDY;
2564 RC = &AMDGPU::SReg_32RegClass;
2565 Ty = LLT::scalar(32);
2566 break;
2568 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2569 return LoadConstant(0);
2570 Reg = &ClusterWorkGroupIDZ;
2571 RC = &AMDGPU::SReg_32RegClass;
2572 Ty = LLT::scalar(32);
2573 break;
2575 if (HasFixedDims)
2576 return LoadConstant(ClusterDims.getDims()[0] - 1);
2577 Reg = &ClusterWorkGroupMaxIDX;
2578 RC = &AMDGPU::SReg_32RegClass;
2579 Ty = LLT::scalar(32);
2580 break;
2582 if (HasFixedDims)
2583 return LoadConstant(ClusterDims.getDims()[1] - 1);
2584 Reg = &ClusterWorkGroupMaxIDY;
2585 RC = &AMDGPU::SReg_32RegClass;
2586 Ty = LLT::scalar(32);
2587 break;
2589 if (HasFixedDims)
2590 return LoadConstant(ClusterDims.getDims()[2] - 1);
2591 Reg = &ClusterWorkGroupMaxIDZ;
2592 RC = &AMDGPU::SReg_32RegClass;
2593 Ty = LLT::scalar(32);
2594 break;
2596 Reg = &ClusterWorkGroupMaxFlatID;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2600 default:
2601 break;
2602 }
2603 }
2604
2605 if (!Reg)
2606 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2607 if (!Reg) {
2608 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2609 // It's possible for a kernarg intrinsic call to appear in a kernel with
2610 // no allocated segment, in which case we do not add the user sgpr
2611 // argument, so just return null.
2612 return DAG.getConstant(0, SDLoc(), VT);
2613 }
2614
2615 // It's undefined behavior if a function marked with the amdgpu-no-*
2616 // attributes uses the corresponding intrinsic.
2617 return DAG.getPOISON(VT);
2618 }
2619
2620 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2621}
2622
2623 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2624 CallingConv::ID CallConv,
2625 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2626 FunctionType *FType,
2627 SIMachineFunctionInfo *Info) {
2628 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2629 const ISD::InputArg *Arg = &Ins[I];
2630
2631 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2632 "vector type argument should have been split");
2633
2634 // First check if it's a PS input addr.
2635 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2636 PSInputNum <= 15) {
2637 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2638
2639 // Inconveniently only the first part of the split is marked as isSplit,
2640 // so skip to the end. We only want to increment PSInputNum once for the
2641 // entire split argument.
2642 if (Arg->Flags.isSplit()) {
2643 while (!Arg->Flags.isSplitEnd()) {
2644 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2645 "unexpected vector split in ps argument type");
2646 if (!SkipArg)
2647 Splits.push_back(*Arg);
2648 Arg = &Ins[++I];
2649 }
2650 }
2651
2652 if (SkipArg) {
2653 // We can safely skip PS inputs.
2654 Skipped.set(Arg->getOrigArgIndex());
2655 ++PSInputNum;
2656 continue;
2657 }
2658
2659 Info->markPSInputAllocated(PSInputNum);
2660 if (Arg->Used)
2661 Info->markPSInputEnabled(PSInputNum);
2662
2663 ++PSInputNum;
2664 }
2665
2666 Splits.push_back(*Arg);
2667 }
2668}
2669
2670// Allocate special inputs passed in VGPRs.
2671 void SITargetLowering::allocateSpecialEntryInputVGPRs(
2672 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2673 SIMachineFunctionInfo &Info) const {
2674 const LLT S32 = LLT::scalar(32);
2675 MachineRegisterInfo &MRI = MF.getRegInfo();
2676
2677 if (Info.hasWorkItemIDX()) {
2678 Register Reg = AMDGPU::VGPR0;
2679 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2680
2681 CCInfo.AllocateReg(Reg);
2682 unsigned Mask =
2683 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2684 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2685 }
2686
2687 if (Info.hasWorkItemIDY()) {
2688 assert(Info.hasWorkItemIDX());
2689 if (Subtarget->hasPackedTID()) {
2690 Info.setWorkItemIDY(
2691 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2692 } else {
2693 unsigned Reg = AMDGPU::VGPR1;
2694 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2695
2696 CCInfo.AllocateReg(Reg);
2697 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2698 }
2699 }
2700
2701 if (Info.hasWorkItemIDZ()) {
2702 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2703 if (Subtarget->hasPackedTID()) {
2704 Info.setWorkItemIDZ(
2705 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2706 } else {
2707 unsigned Reg = AMDGPU::VGPR2;
2708 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2709
2710 CCInfo.AllocateReg(Reg);
2711 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2712 }
2713 }
2714}
2715
2716// Try to allocate a VGPR at the end of the argument list, or if no argument
2717 // VGPRs are left, allocate a stack slot instead.
2718 // If \p Mask is given it indicates the bitfield position in the register.
2719 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2720static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2721 ArgDescriptor Arg = ArgDescriptor()) {
2722 if (Arg.isSet())
2723 return ArgDescriptor::createArg(Arg, Mask);
2724
2725 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2726 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2727 if (RegIdx == ArgVGPRs.size()) {
2728 // Spill to stack required.
2729 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2730
2731 return ArgDescriptor::createStack(Offset, Mask);
2732 }
2733
2734 unsigned Reg = ArgVGPRs[RegIdx];
2735 Reg = CCInfo.AllocateReg(Reg);
2736 assert(Reg != AMDGPU::NoRegister);
2737
2738 MachineFunction &MF = CCInfo.getMachineFunction();
2739 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2740 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2741 return ArgDescriptor::createRegister(Reg, Mask);
2742}
2743
2744 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2745 const TargetRegisterClass *RC,
2746 unsigned NumArgRegs) {
2747 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2748 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2749 if (RegIdx == ArgSGPRs.size())
2750 report_fatal_error("ran out of SGPRs for arguments");
2751
2752 unsigned Reg = ArgSGPRs[RegIdx];
2753 Reg = CCInfo.AllocateReg(Reg);
2754 assert(Reg != AMDGPU::NoRegister);
2755
2756 MachineFunction &MF = CCInfo.getMachineFunction();
2757 MF.addLiveIn(Reg, RC);
2758 return ArgDescriptor::createRegister(Reg);
2759 }
2760
2761// If this has a fixed position, we still should allocate the register in the
2762// CCInfo state. Technically we could get away with this for values passed
2763// outside of the normal argument range.
2764 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2765 const TargetRegisterClass *RC,
2766 MCRegister Reg) {
2767 Reg = CCInfo.AllocateReg(Reg);
2768 assert(Reg != AMDGPU::NoRegister);
2769 MachineFunction &MF = CCInfo.getMachineFunction();
2770 MF.addLiveIn(Reg, RC);
2771}
2772
2773static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2774 if (Arg) {
2775 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2776 Arg.getRegister());
2777 } else
2778 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2779}
2780
2781static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2782 if (Arg) {
2783 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2784 Arg.getRegister());
2785 } else
2786 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2787}
2788
2789/// Allocate implicit function VGPR arguments at the end of allocated user
2790/// arguments.
2791 void SITargetLowering::allocateSpecialInputVGPRs(
2792 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2793 SIMachineFunctionInfo &Info) const {
2794 const unsigned Mask = 0x3ff;
2795 ArgDescriptor Arg;
2796
2797 if (Info.hasWorkItemIDX()) {
2798 Arg = allocateVGPR32Input(CCInfo, Mask);
2799 Info.setWorkItemIDX(Arg);
2800 }
2801
2802 if (Info.hasWorkItemIDY()) {
2803 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2804 Info.setWorkItemIDY(Arg);
2805 }
2806
2807 if (Info.hasWorkItemIDZ())
2808 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2809}
2810
2811/// Allocate implicit function VGPR arguments in fixed registers.
2812 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2813 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2814 SIMachineFunctionInfo &Info) const {
2815 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2816 if (!Reg)
2817 report_fatal_error("failed to allocate VGPR for implicit arguments");
2818
2819 const unsigned Mask = 0x3ff;
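// With the 0x3ff masks shifted by 0, 10 and 20 bits, workitem IDs X, Y and Z
// occupy bits [9:0], [19:10] and [29:20] of the same VGPR.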
2820 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2821 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2822 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2823}
2824
2825 void SITargetLowering::allocateSpecialInputSGPRs(
2826 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2827 SIMachineFunctionInfo &Info) const {
2828 auto &ArgInfo = Info.getArgInfo();
2829 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2830
2831 // TODO: Unify handling with private memory pointers.
2832 if (UserSGPRInfo.hasDispatchPtr())
2833 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2834
2835 if (UserSGPRInfo.hasQueuePtr())
2836 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2837
2838 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2839 // constant offset from the kernarg segment.
2840 if (Info.hasImplicitArgPtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2842
2843 if (UserSGPRInfo.hasDispatchID())
2844 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2845
2846 // flat_scratch_init is not applicable for non-kernel functions.
2847
2848 if (Info.hasWorkGroupIDX())
2849 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2850
2851 if (Info.hasWorkGroupIDY())
2852 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2853
2854 if (Info.hasWorkGroupIDZ())
2855 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2856
2857 if (Info.hasLDSKernelId())
2858 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2859}
2860
2861// Allocate special inputs passed in user SGPRs.
2862 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2863 MachineFunction &MF,
2864 const SIRegisterInfo &TRI,
2865 SIMachineFunctionInfo &Info) const {
2866 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2867 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2868 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2869 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2870 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2871 }
2872
2873 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2874 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2875 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2876 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2877 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2878 }
2879
2880 if (UserSGPRInfo.hasDispatchPtr()) {
2881 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2882 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2883 CCInfo.AllocateReg(DispatchPtrReg);
2884 }
2885
2886 if (UserSGPRInfo.hasQueuePtr()) {
2887 Register QueuePtrReg = Info.addQueuePtr(TRI);
2888 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2889 CCInfo.AllocateReg(QueuePtrReg);
2890 }
2891
2892 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2893 MachineRegisterInfo &MRI = MF.getRegInfo();
2894 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2895 CCInfo.AllocateReg(InputPtrReg);
2896
2897 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2898 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2899 }
2900
2901 if (UserSGPRInfo.hasDispatchID()) {
2902 Register DispatchIDReg = Info.addDispatchID(TRI);
2903 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2904 CCInfo.AllocateReg(DispatchIDReg);
2905 }
2906
2907 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2908 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2909 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2910 CCInfo.AllocateReg(FlatScratchInitReg);
2911 }
2912
2913 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2914 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2915 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2916 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2917 }
2918
2919 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2920 // these from the dispatch pointer.
2921}
2922
2923 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2924 // sequential starting from the first argument.
2925 void SITargetLowering::allocatePreloadKernArgSGPRs(
2926 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2927 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2928 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2929 Function &F = MF.getFunction();
2930 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2931 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2932 bool InPreloadSequence = true;
2933 unsigned InIdx = 0;
2934 bool AlignedForImplictArgs = false;
2935 unsigned ImplicitArgOffset = 0;
2936 for (auto &Arg : F.args()) {
2937 if (!InPreloadSequence || !Arg.hasInRegAttr())
2938 break;
2939
2940 unsigned ArgIdx = Arg.getArgNo();
2941 // Don't preload non-original args or parts not in the current preload
2942 // sequence.
2943 if (InIdx < Ins.size() &&
2944 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2945 break;
2946
2947 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2948 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2949 InIdx++) {
2950 assert(ArgLocs[ArgIdx].isMemLoc());
2951 auto &ArgLoc = ArgLocs[InIdx];
2952 const Align KernelArgBaseAlign = Align(16);
2953 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2954 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2955 unsigned NumAllocSGPRs =
2956 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
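// For example, a 64-bit argument occupies alignTo(64, 32) / 32 == 2 SGPRs.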
2957
2958 // Fix alignment for hidden arguments.
2959 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2960 if (!AlignedForImplictArgs) {
2961 ImplicitArgOffset =
2962 alignTo(LastExplicitArgOffset,
2963 Subtarget->getAlignmentForImplicitArgPtr()) -
2964 LastExplicitArgOffset;
2965 AlignedForImplictArgs = true;
2966 }
2967 ArgOffset += ImplicitArgOffset;
2968 }
2969
2970 // Arg is preloaded into the previous SGPR.
2971 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2972 assert(InIdx >= 1 && "No previous SGPR");
2973 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2974 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2975 continue;
2976 }
2977
2978 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2979 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
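// For example, 12 bytes of padding before this argument consumes
// alignTo(12, 4) / 4 == 3 otherwise-free user SGPRs.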
2980 // Check for free user SGPRs for preloading.
2981 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2982 InPreloadSequence = false;
2983 break;
2984 }
2985
2986 // Preload this argument.
2987 const TargetRegisterClass *RC =
2988 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2989 SmallVectorImpl<MCRegister> *PreloadRegs =
2990 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2991
2992 if (PreloadRegs->size() > 1)
2993 RC = &AMDGPU::SGPR_32RegClass;
2994 for (auto &Reg : *PreloadRegs) {
2995 assert(Reg);
2996 MF.addLiveIn(Reg, RC);
2997 CCInfo.AllocateReg(Reg);
2998 }
2999
3000 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3001 }
3002 }
3003}
3004
3005 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3006 const SIRegisterInfo &TRI,
3007 SIMachineFunctionInfo &Info) const {
3008 // Always allocate this last since it is a synthetic preload.
3009 if (Info.hasLDSKernelId()) {
3010 Register Reg = Info.addLDSKernelId();
3011 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3012 CCInfo.AllocateReg(Reg);
3013 }
3014}
3015
3016// Allocate special input registers that are initialized per-wave.
3017 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3018 SIMachineFunctionInfo &Info,
3019 CallingConv::ID CallConv,
3020 bool IsShader) const {
3021 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3022 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3023 // Note: user SGPRs are handled by the front-end for graphics shaders
3024 // Pad up the used user SGPRs with dead inputs.
3025
3026 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3027 // before enabling architected SGPRs for workgroup IDs.
3028 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3029
3030 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3031 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3032 // rely on it to reach 16 since if we end up having no stack usage, it will
3033 // not really be added.
3034 unsigned NumRequiredSystemSGPRs =
3035 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3036 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3037 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3038 Register Reg = Info.addReservedUserSGPR();
3039 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3040 CCInfo.AllocateReg(Reg);
3041 }
3042 }
3043
3044 if (!HasArchitectedSGPRs) {
3045 if (Info.hasWorkGroupIDX()) {
3046 Register Reg = Info.addWorkGroupIDX();
3047 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3048 CCInfo.AllocateReg(Reg);
3049 }
3050
3051 if (Info.hasWorkGroupIDY()) {
3052 Register Reg = Info.addWorkGroupIDY();
3053 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3054 CCInfo.AllocateReg(Reg);
3055 }
3056
3057 if (Info.hasWorkGroupIDZ()) {
3058 Register Reg = Info.addWorkGroupIDZ();
3059 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3060 CCInfo.AllocateReg(Reg);
3061 }
3062 }
3063
3064 if (Info.hasWorkGroupInfo()) {
3065 Register Reg = Info.addWorkGroupInfo();
3066 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3067 CCInfo.AllocateReg(Reg);
3068 }
3069
3070 if (Info.hasPrivateSegmentWaveByteOffset()) {
3071 // Scratch wave offset passed in system SGPR.
3072 unsigned PrivateSegmentWaveByteOffsetReg;
3073
3074 if (IsShader) {
3075 PrivateSegmentWaveByteOffsetReg =
3076 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3077
3078 // This is true if the scratch wave byte offset doesn't have a fixed
3079 // location.
3080 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3081 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3082 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3083 }
3084 } else
3085 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3086
3087 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3088 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3089 }
3090
3091 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3092 Info.getNumPreloadedSGPRs() >= 16);
3093}
3094
3095 void SITargetLowering::reservePrivateMemoryRegs(const TargetMachine &TM,
3096 MachineFunction &MF,
3097 const SIRegisterInfo &TRI,
3098 SIMachineFunctionInfo &Info) const {
3099 // Now that we've figured out where the scratch register inputs are, see if
3100 // we should reserve the arguments and use them directly.
3101 MachineFrameInfo &MFI = MF.getFrameInfo();
3102 bool HasStackObjects = MFI.hasStackObjects();
3103 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3104
3105 // Record that we know we have non-spill stack objects so we don't need to
3106 // check all stack objects later.
3107 if (HasStackObjects)
3108 Info.setHasNonSpillStackObjects(true);
3109
3110 // Everything live out of a block is spilled with fast regalloc, so it's
3111 // almost certain that spilling will be required.
3112 if (TM.getOptLevel() == CodeGenOptLevel::None)
3113 HasStackObjects = true;
3114
3115 // For now assume stack access is needed in any callee functions, so we need
3116 // the scratch registers to pass in.
3117 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3118
3119 if (!ST.enableFlatScratch()) {
3120 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3121 // If we have stack objects, we unquestionably need the private buffer
3122 // resource. For the Code Object V2 ABI, this will be the first 4 user
3123 // SGPR inputs. We can reserve those and use them directly.
3124
3125 Register PrivateSegmentBufferReg =
3126 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3127 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3128 } else {
3129 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3130 // We tentatively reserve the last registers (skipping the last registers
3131 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3132 // we'll replace these with the ones immediately after those which were
3133 // really allocated. In the prologue copies will be inserted from the
3134 // argument to these reserved registers.
3135
3136 // Without HSA, relocations are used for the scratch pointer and the
3137 // buffer resource setup is always inserted in the prologue. Scratch wave
3138 // offset is still in an input SGPR.
3139 Info.setScratchRSrcReg(ReservedBufferReg);
3140 }
3141 }
3142
3143 MachineRegisterInfo &MRI = MF.getRegInfo();
3144
3145 // For entry functions we have to set up the stack pointer if we use it,
3146 // whereas non-entry functions get this "for free". This means there is no
3147 // intrinsic advantage to using S32 over S34 in cases where we do not have
3148 // calls but do need a frame pointer (i.e. if we are requested to have one
3149 // because frame pointer elimination is disabled). To keep things simple we
3150 // only ever use S32 as the call ABI stack pointer, and so using it does not
3151 // imply we need a separate frame pointer.
3152 //
3153 // Try to use s32 as the SP, but move it if it would interfere with input
3154 // arguments. This won't work with calls though.
3155 //
3156 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3157 // registers.
3158 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3159 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3160 } else {
3162
3163 if (MFI.hasCalls())
3164 report_fatal_error("call in graphics shader with too many input SGPRs");
3165
3166 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3167 if (!MRI.isLiveIn(Reg)) {
3168 Info.setStackPtrOffsetReg(Reg);
3169 break;
3170 }
3171 }
3172
3173 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3174 report_fatal_error("failed to find register for SP");
3175 }
3176
3177 // hasFP should be accurate for entry functions even before the frame is
3178 // finalized, because it does not rely on the known stack size, only
3179 // properties like whether variable sized objects are present.
3180 if (ST.getFrameLowering()->hasFP(MF)) {
3181 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3182 }
3183}
3184
3185 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3186 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3187 return !Info->isEntryFunction();
3188}
3189
3191
3192 void SITargetLowering::insertCopiesSplitCSR(
3193 MachineBasicBlock *Entry,
3194 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3195 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3196
3197 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3198 if (!IStart)
3199 return;
3200
3201 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3202 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3203 MachineBasicBlock::iterator MBBI = Entry->begin();
3204 for (const MCPhysReg *I = IStart; *I; ++I) {
3205 const TargetRegisterClass *RC = nullptr;
3206 if (AMDGPU::SReg_64RegClass.contains(*I))
3207 RC = &AMDGPU::SGPR_64RegClass;
3208 else if (AMDGPU::SReg_32RegClass.contains(*I))
3209 RC = &AMDGPU::SGPR_32RegClass;
3210 else
3211 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3212
3213 Register NewVR = MRI->createVirtualRegister(RC);
3214 // Create copy from CSR to a virtual register.
3215 Entry->addLiveIn(*I);
3216 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3217 .addReg(*I);
3218
3219 // Insert the copy-back instructions right before the terminator.
3220 for (auto *Exit : Exits)
3221 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3222 TII->get(TargetOpcode::COPY), *I)
3223 .addReg(NewVR);
3224 }
3225}
3226
3227 SDValue SITargetLowering::LowerFormalArguments(
3228 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3229 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3230 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3231 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3232
3233 MachineFunction &MF = DAG.getMachineFunction();
3234 const Function &Fn = MF.getFunction();
3235 FunctionType *FType = MF.getFunction().getFunctionType();
3236 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3237 bool IsError = false;
3238
3239 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3240 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3241 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3242 IsError = true;
3243 }
3244
3245 SmallVector<ISD::InputArg, 16> Splits;
3246 SmallVector<CCValAssign, 16> ArgLocs;
3247 BitVector Skipped(Ins.size());
3248 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3249 *DAG.getContext());
3250
3251 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3252 bool IsKernel = AMDGPU::isKernel(CallConv);
3253 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3254
3255 if (IsGraphics) {
3256 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3257 assert(!UserSGPRInfo.hasDispatchPtr() &&
3258 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3259 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3260 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3261 (void)UserSGPRInfo;
3262 if (!Subtarget->enableFlatScratch())
3263 assert(!UserSGPRInfo.hasFlatScratchInit());
3264 if ((CallConv != CallingConv::AMDGPU_CS &&
3265 CallConv != CallingConv::AMDGPU_Gfx &&
3266 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3267 !Subtarget->hasArchitectedSGPRs())
3268 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3269 !Info->hasWorkGroupIDZ());
3270 }
3271
3272 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3273
3274 if (CallConv == CallingConv::AMDGPU_PS) {
3275 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3276
3277 // At least one interpolation mode must be enabled or else the GPU will
3278 // hang.
3279 //
3280 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3281 // set PSInputAddr, the user wants to enable some bits after the compilation
3282 // based on run-time states. Since we can't know what the final PSInputEna
3283 // will look like, we shouldn't do anything here, and the user should take
3284 // responsibility for the correct programming.
3285 //
3286 // Otherwise, the following restrictions apply:
3287 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3288 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3289 // enabled too.
3290 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3291 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3292 CCInfo.AllocateReg(AMDGPU::VGPR0);
3293 CCInfo.AllocateReg(AMDGPU::VGPR1);
3294 Info->markPSInputAllocated(0);
3295 Info->markPSInputEnabled(0);
3296 }
3297 if (Subtarget->isAmdPalOS()) {
3298 // For isAmdPalOS, the user does not enable some bits after compilation
3299 // based on run-time states; the register values being generated here are
3300 // the final ones set in hardware. Therefore we need to apply the
3301 // workaround to PSInputAddr and PSInputEnable together. (The case where
3302 // a bit is set in PSInputAddr but not PSInputEnable is where the
3303 // frontend set up an input arg for a particular interpolation mode, but
3304 // nothing uses that input arg. Really we should have an earlier pass
3305 // that removes such an arg.)
3306 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3307 if ((PsInputBits & 0x7F) == 0 ||
3308 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3309 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3310 }
3311 } else if (IsKernel) {
3312 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3313 } else {
3314 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3315 Ins.end());
3316 }
3317
3318 if (IsKernel)
3319 analyzeFormalArgumentsCompute(CCInfo, Ins);
3320
3321 if (IsEntryFunc) {
3322 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3323 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3324 if (IsKernel && Subtarget->hasKernargPreload())
3325 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3326
3327 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3328 } else if (!IsGraphics) {
3329 // For the fixed ABI, pass workitem IDs in the last argument register.
3330 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3331
3332 // FIXME: Sink this into allocateSpecialInputSGPRs
3333 if (!Subtarget->enableFlatScratch())
3334 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3335
3336 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3337 }
3338
3339 if (!IsKernel) {
3340 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3341 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3342
3343 // This assumes the registers are allocated by CCInfo in ascending order
3344 // with no gaps.
3345 Info->setNumWaveDispatchSGPRs(
3346 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3347 Info->setNumWaveDispatchVGPRs(
3348 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3349 } else if (Info->getNumKernargPreloadedSGPRs()) {
3350 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3351 }
3352
3353 SmallVector<SDValue, 16> Chains;
3354
3355 if (IsWholeWaveFunc) {
3356 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3357 {MVT::i1, MVT::Other}, Chain);
3358 InVals.push_back(Setup.getValue(0));
3359 Chains.push_back(Setup.getValue(1));
3360 }
3361
3362 // FIXME: This is the minimum kernel argument alignment. We should improve
3363 // this to the maximum alignment of the arguments.
3364 //
3365 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3366 // kern arg offset.
3367 const Align KernelArgBaseAlign = Align(16);
3368
3369 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3370 ++i) {
3371 const ISD::InputArg &Arg = Ins[i];
3372 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3373 InVals.push_back(DAG.getPOISON(Arg.VT));
3374 continue;
3375 }
3376
3377 CCValAssign &VA = ArgLocs[ArgIdx++];
3378 MVT VT = VA.getLocVT();
3379
3380 if (IsEntryFunc && VA.isMemLoc()) {
3381 VT = Ins[i].VT;
3382 EVT MemVT = VA.getLocVT();
3383
3384 const uint64_t Offset = VA.getLocMemOffset();
3385 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3386
3387 if (Arg.Flags.isByRef()) {
3388 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3389
3390 const GCNTargetMachine &TM =
3391 static_cast<const GCNTargetMachine &>(getTargetMachine());
3392 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3393 Arg.Flags.getPointerAddrSpace())) {
3396 }
3397
3398 InVals.push_back(Ptr);
3399 continue;
3400 }
3401
3402 SDValue NewArg;
3403 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3404 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3405 // In this case the argument is packed into the previous preload SGPR.
3406 int64_t AlignDownOffset = alignDown(Offset, 4);
3407 int64_t OffsetDiff = Offset - AlignDownOffset;
3408 EVT IntVT = MemVT.changeTypeToInteger();
3409
3410 const SIMachineFunctionInfo *Info =
3413 Register Reg =
3414 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3415
3416 assert(Reg);
3417 Register VReg = MRI.getLiveInVirtReg(Reg);
3418 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3419
3420 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3421 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3422
3423 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3424 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3425 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3426 Ins[i].Flags.isSExt(), &Ins[i]);
3427
3428 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3429 } else {
3430 const SIMachineFunctionInfo *Info =
3433 const SmallVectorImpl<MCRegister> &PreloadRegs =
3434 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3435
3436 SDValue Copy;
3437 if (PreloadRegs.size() == 1) {
3438 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3439 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3440 NewArg = DAG.getCopyFromReg(
3441 Chain, DL, VReg,
3442 EVT::getIntegerVT(*DAG.getContext(),
3443 TRI->getRegSizeInBits(*RC)));
3444
3445 } else {
3446 // If the kernarg alignment does not match the alignment of the SGPR
3447 // tuple RC that can accommodate this argument, it will be built up
3448 // via copies from the individual SGPRs that the argument was
3449 // preloaded to.
3450 SmallVector<SDValue, 4> Elts;
3451 for (auto Reg : PreloadRegs) {
3452 Register VReg = MRI.getLiveInVirtReg(Reg);
3453 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3454 Elts.push_back(Copy);
3455 }
3456 NewArg =
3457 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3458 PreloadRegs.size()),
3459 DL, Elts);
3460 }
3461
3462 // If the argument was preloaded to multiple consecutive 32-bit
3463 // registers because of misalignment between addressable SGPR tuples
3464 // and the argument size, we can still assume, because of kernarg
3465 // segment alignment restrictions, that NewArg's size is the same as
3466 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3467 // truncate since we cannot preload to less than a single SGPR and the
3468 // MemVT may be smaller.
3469 EVT MemVTInt =
3470 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3471 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3472 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3473
3474 NewArg = DAG.getBitcast(MemVT, NewArg);
3475 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3476 Ins[i].Flags.isSExt(), &Ins[i]);
3477 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3478 }
3479 } else {
3480 // Hidden arguments that are in the kernel signature must be preloaded
3481 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3482 // the argument list and is not preloaded.
3483 if (Arg.isOrigArg()) {
3484 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3485 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3486 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3487 *OrigArg->getParent(),
3488 "hidden argument in kernel signature was not preloaded",
3489 DL.getDebugLoc()));
3490 }
3491 }
3492
3493 NewArg =
3494 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3495 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3496 }
3497 Chains.push_back(NewArg.getValue(1));
3498
3499 auto *ParamTy =
3500 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3501 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3502 ParamTy &&
3503 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3504 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3505 // On SI local pointers are just offsets into LDS, so they are always
3506 // less than 16-bits. On CI and newer they could potentially be
3507 // real pointers, so we can't guarantee their size.
3508 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3509 DAG.getValueType(MVT::i16));
3510 }
3511
3512 InVals.push_back(NewArg);
3513 continue;
3514 }
3515 if (!IsEntryFunc && VA.isMemLoc()) {
3516 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3517 InVals.push_back(Val);
3518 if (!Arg.Flags.isByVal())
3519 Chains.push_back(Val.getValue(1));
3520 continue;
3521 }
3522
3523 assert(VA.isRegLoc() && "Parameter must be in a register!");
3524
3525 Register Reg = VA.getLocReg();
3526 const TargetRegisterClass *RC = nullptr;
3527 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3528 RC = &AMDGPU::VGPR_32RegClass;
3529 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3530 RC = &AMDGPU::SGPR_32RegClass;
3531 else
3532 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3533
3534 Reg = MF.addLiveIn(Reg, RC);
3535 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3536
3537 if (Arg.Flags.isSRet()) {
3538 // The return object should be reasonably addressable.
3539
3540 // FIXME: This helps when the return is a real sret. If it is an
3541 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3542 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3543 unsigned NumBits =
3544 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3545 Val = DAG.getNode(
3546 ISD::AssertZext, DL, VT, Val,
3547 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3548 }
3549
3550 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3551 InVals.push_back(Val);
3552 }
3553
3554 // Start adding system SGPRs.
3555 if (IsEntryFunc)
3556 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3557
3558 if (DAG.getPass()) {
3559 auto &ArgUsageInfo =
3561 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3562 } else if (auto *MFAM = DAG.getMFAM()) {
3563 Module &M = *MF.getFunction().getParent();
3564 auto *ArgUsageInfo =
3566 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3567 if (ArgUsageInfo)
3568 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3569 }
3570
3571 unsigned StackArgSize = CCInfo.getStackSize();
3572 Info->setBytesInStackArgArea(StackArgSize);
3573
3574 return Chains.empty() ? Chain
3575 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3576}
3577
3578// TODO: If return values can't fit in registers, we should return as many as
3579// possible in registers before passing on stack.
3581 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3582 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3583 const Type *RetTy) const {
3584 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3585 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3586 // for shaders. Vector types should be explicitly handled by CC.
3587 if (AMDGPU::isEntryFunctionCC(CallConv))
3588 return true;
3589
3591 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3592 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3593 return false;
3594
3595 // We must use the stack if return would require unavailable registers.
3596 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3597 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3598 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3599 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3600 return false;
3601
3602 return true;
3603}
3604
3605SDValue
3607 bool isVarArg,
3609 const SmallVectorImpl<SDValue> &OutVals,
3610 const SDLoc &DL, SelectionDAG &DAG) const {
3614
3615 if (AMDGPU::isKernel(CallConv)) {
3616 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3617 OutVals, DL, DAG);
3618 }
3619
3620 bool IsShader = AMDGPU::isShader(CallConv);
3621
3622 Info->setIfReturnsVoid(Outs.empty());
3623 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3624
3625 // CCValAssign - represents the assignment of the return value to a location.
3627
3628 // CCState - Info about the registers and stack slots.
3629 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3630 *DAG.getContext());
3631
3632 // Analyze outgoing return values.
3633 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3634
3635 SDValue Glue;
3637 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3638
3639 SDValue ReadFirstLane =
3640 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3641 // Copy the result values into the output registers.
3642 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3643 ++I, ++RealRVLocIdx) {
3644 CCValAssign &VA = RVLocs[I];
3645 assert(VA.isRegLoc() && "Can only return in registers!");
3646 // TODO: Partially return in registers if return values don't fit.
3647 SDValue Arg = OutVals[RealRVLocIdx];
3648
3649 // Copied from other backends.
3650 switch (VA.getLocInfo()) {
3651 case CCValAssign::Full:
3652 break;
3653 case CCValAssign::BCvt:
3654 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3655 break;
3656 case CCValAssign::SExt:
3657 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3658 break;
3659 case CCValAssign::ZExt:
3660 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3661 break;
3662 case CCValAssign::AExt:
3663 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3664 break;
3665 default:
3666 llvm_unreachable("Unknown loc info!");
3667 }
3668 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3670 ReadFirstLane, Arg);
3671 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3672 Glue = Chain.getValue(1);
3673 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3674 }
3675
3676 // FIXME: Does sret work properly?
3677 if (!Info->isEntryFunction()) {
3678 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3679 const MCPhysReg *I =
3680 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3681 if (I) {
3682 for (; *I; ++I) {
3683 if (AMDGPU::SReg_64RegClass.contains(*I))
3684 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3685 else if (AMDGPU::SReg_32RegClass.contains(*I))
3686 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3687 else
3688 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3689 }
3690 }
3691 }
3692
3693 // Update chain and glue.
3694 RetOps[0] = Chain;
3695 if (Glue.getNode())
3696 RetOps.push_back(Glue);
3697
3698 unsigned Opc = AMDGPUISD::ENDPGM;
3699 if (!IsWaveEnd)
3700 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3701 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3702 : AMDGPUISD::RET_GLUE;
3703 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3704}
3705
3707 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3708 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3709 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3710 SDValue ThisVal) const {
3711 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3712
3713 // Assign locations to each value returned by this call.
3715 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3716 *DAG.getContext());
3717 CCInfo.AnalyzeCallResult(Ins, RetCC);
3718
3719 // Copy all of the result registers out of their specified physreg.
3720 for (CCValAssign VA : RVLocs) {
3721 SDValue Val;
3722
3723 if (VA.isRegLoc()) {
3724 Val =
3725 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3726 Chain = Val.getValue(1);
3727 InGlue = Val.getValue(2);
3728 } else if (VA.isMemLoc()) {
3729 report_fatal_error("TODO: return values in memory");
3730 } else
3731 llvm_unreachable("unknown argument location type");
3732
3733 switch (VA.getLocInfo()) {
3734 case CCValAssign::Full:
3735 break;
3736 case CCValAssign::BCvt:
3737 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3738 break;
3739 case CCValAssign::ZExt:
3740 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3741 DAG.getValueType(VA.getValVT()));
3742 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3743 break;
3744 case CCValAssign::SExt:
3745 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3746 DAG.getValueType(VA.getValVT()));
3747 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3748 break;
3749 case CCValAssign::AExt:
3750 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3751 break;
3752 default:
3753 llvm_unreachable("Unknown loc info!");
3754 }
3755
3756 InVals.push_back(Val);
3757 }
3758
3759 return Chain;
3760}
3761
3762// Add code to pass special inputs required depending on used features separate
3763// from the explicit user arguments present in the IR.
3765 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3766 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3767 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3768 // If we don't have a call site, this was a call inserted by
3769 // legalization. These can never use special inputs.
3770 if (!CLI.CB)
3771 return;
3772
3773 SelectionDAG &DAG = CLI.DAG;
3774 const SDLoc &DL = CLI.DL;
3775 const Function &F = DAG.getMachineFunction().getFunction();
3776
3777 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3778 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3779
3780 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3782 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3783 if (DAG.getPass()) {
3784 auto &ArgUsageInfo =
3786 CalleeArgInfo =
3787 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3788 } else if (auto *MFAM = DAG.getMFAM()) {
3790 auto *ArgUsageInfo =
3792 DAG.getMachineFunction())
3793 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3794 if (ArgUsageInfo)
3795 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3796 }
3797 }
3798
3799 // TODO: Unify with private memory register handling. This is complicated by
3800 // the fact that at least in kernels, the input argument is not necessarily
3801 // in the same location as the input.
3802 // clang-format off
3803 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3804 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3805 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3806 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3807 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3808 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3809 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3810 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3811 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3812 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3813 };
3814 // clang-format on
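// For example, a call site carrying both "amdgpu-no-workgroup-id-x" and
// "amdgpu-no-cluster-id-x" lets the WORKGROUP_ID_X row be skipped entirely,
// while a call site missing "amdgpu-no-dispatch-ptr" forces the dispatch
// pointer to be forwarded from the caller's preloaded copy when it has one,
// or passed as poison otherwise.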
3815
3816 for (auto [InputID, Attrs] : ImplicitAttrs) {
3817 // If the callee does not use the attribute value, skip copying the value.
3818 if (all_of(Attrs, [&](StringRef Attr) {
3819 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3820 }))
3821 continue;
3822
3823 const auto [OutgoingArg, ArgRC, ArgTy] =
3824 CalleeArgInfo->getPreloadedValue(InputID);
3825 if (!OutgoingArg)
3826 continue;
3827
3828 const auto [IncomingArg, IncomingArgRC, Ty] =
3829 CallerArgInfo.getPreloadedValue(InputID);
3830 assert(IncomingArgRC == ArgRC);
3831
3832 // All special arguments are ints for now.
3833 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3834 SDValue InputReg;
3835
3836 if (IncomingArg) {
3837 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3838 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3839 // The implicit arg ptr is special because it doesn't have a corresponding
3840 // input for kernels, and is computed from the kernarg segment pointer.
3841 InputReg = getImplicitArgPtr(DAG, DL);
3842 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3843 std::optional<uint32_t> Id =
3845 if (Id.has_value()) {
3846 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3847 } else {
3848 InputReg = DAG.getPOISON(ArgVT);
3849 }
3850 } else {
3851 // We may have proven the input wasn't needed, even though the ABI still
3852 // requires it. We just need to allocate the register appropriately.
3853 InputReg = DAG.getPOISON(ArgVT);
3854 }
3855
3856 if (OutgoingArg->isRegister()) {
3857 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3858 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3859 report_fatal_error("failed to allocate implicit input argument");
3860 } else {
3861 unsigned SpecialArgOffset =
3862 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3863 SDValue ArgStore =
3864 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3865 MemOpChains.push_back(ArgStore);
3866 }
3867 }
3868
3869 // Pack workitem IDs into a single register or pass it as is if already
3870 // packed.
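// For reference, the packed layout assembled below (matching the shift
// amounts used in this function) is:
//   bits [9:0]   workitem ID X
//   bits [19:10] workitem ID Y
//   bits [29:20] workitem ID Z
// so an unpacked Y component is inserted as (Y << 10) and Z as (Z << 20),
// OR'd into whatever components are already present.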
3871
3872 auto [OutgoingArg, ArgRC, Ty] =
3874 if (!OutgoingArg)
3875 std::tie(OutgoingArg, ArgRC, Ty) =
3877 if (!OutgoingArg)
3878 std::tie(OutgoingArg, ArgRC, Ty) =
3880 if (!OutgoingArg)
3881 return;
3882
3883 const ArgDescriptor *IncomingArgX = std::get<0>(
3885 const ArgDescriptor *IncomingArgY = std::get<0>(
3887 const ArgDescriptor *IncomingArgZ = std::get<0>(
3889
3890 SDValue InputReg;
3891 SDLoc SL;
3892
3893 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3894 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3895 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3896
3897 // If incoming ids are not packed we need to pack them.
3898 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3899 NeedWorkItemIDX) {
3900 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3901 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3902 } else {
3903 InputReg = DAG.getConstant(0, DL, MVT::i32);
3904 }
3905 }
3906
3907 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3908 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3909 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3910 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3911 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3912 InputReg = InputReg.getNode()
3913 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3914 : Y;
3915 }
3916
3917 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3918 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3919 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3920 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3921 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3922 InputReg = InputReg.getNode()
3923 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3924 : Z;
3925 }
3926
3927 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3928 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3929 // We're in a situation where the outgoing function requires the workitem
3930 // ID, but the calling function does not have it (e.g. a graphics function
3931 // calling a C calling convention function). This is illegal, but we need
3932 // to produce something.
3933 InputReg = DAG.getPOISON(MVT::i32);
3934 } else {
3935 // Workitem IDs are already packed; any present incoming argument
3936 // will carry all of the required fields.
3937 ArgDescriptor IncomingArg =
3938 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3939 : IncomingArgY ? *IncomingArgY
3940 : *IncomingArgZ,
3941 ~0u);
3942 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3943 }
3944 }
3945
3946 if (OutgoingArg->isRegister()) {
3947 if (InputReg)
3948 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3949
3950 CCInfo.AllocateReg(OutgoingArg->getRegister());
3951 } else {
3952 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3953 if (InputReg) {
3954 SDValue ArgStore =
3955 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3956 MemOpChains.push_back(ArgStore);
3957 }
3958 }
3959}
3960
3962 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3964 const SmallVectorImpl<SDValue> &OutVals,
3965 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3966 if (AMDGPU::isChainCC(CalleeCC))
3967 return true;
3968
3969 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3970 return false;
3971
3972 // For a divergent call target, we need to do a waterfall loop over the
3973 // possible callees which precludes us from using a simple jump.
3974 if (Callee->isDivergent())
3975 return false;
3976
3978 const Function &CallerF = MF.getFunction();
3979 CallingConv::ID CallerCC = CallerF.getCallingConv();
3981 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3982
3983 // Kernels aren't callable, and don't have a live-in return address, so it
3984 // doesn't make sense to do a tail call with entry functions.
3985 if (!CallerPreserved)
3986 return false;
3987
3988 bool CCMatch = CallerCC == CalleeCC;
3989
3991 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3992 return true;
3993 return false;
3994 }
3995
3996 // TODO: Can we handle var args?
3997 if (IsVarArg)
3998 return false;
3999
4000 for (const Argument &Arg : CallerF.args()) {
4001 if (Arg.hasByValAttr())
4002 return false;
4003 }
4004
4005 LLVMContext &Ctx = *DAG.getContext();
4006
4007 // Check that the call results are passed in the same way.
4008 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4009 CCAssignFnForCall(CalleeCC, IsVarArg),
4010 CCAssignFnForCall(CallerCC, IsVarArg)))
4011 return false;
4012
4013 // The callee has to preserve all registers the caller needs to preserve.
4014 if (!CCMatch) {
4015 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4016 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4017 return false;
4018 }
4019
4020 // Nothing more to check if the callee is taking no arguments.
4021 if (Outs.empty())
4022 return true;
4023
4025 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4026
4027 // FIXME: We are not allocating special input registers, so we will be
4028 // deciding based on incorrect register assignments.
4029 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4030
4031 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4032 // If the stack arguments for this call do not fit into our own save area,
4033 // then the call cannot be made a tail call.
4034 // TODO: Is this really necessary?
4035 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4036 return false;
4037
4038 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4039 // FIXME: What about inreg arguments that end up passed in memory?
4040 if (!CCVA.isRegLoc())
4041 continue;
4042
4043 // If we are passing an argument in an SGPR, and the value is divergent,
4044 // this call requires a waterfall loop.
4045 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4046 LLVM_DEBUG(
4047 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4048 << printReg(CCVA.getLocReg(), TRI) << '\n');
4049 return false;
4050 }
4051 }
4052
4053 const MachineRegisterInfo &MRI = MF.getRegInfo();
4054 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4055}
4056
4058 if (!CI->isTailCall())
4059 return false;
4060
4061 const Function *ParentFn = CI->getFunction();
4063 return false;
4064 return true;
4065}
4066
4067namespace {
4068 // Chain calls have special arguments that we need to handle. These tag
4069 // along at the end of the argument list(s), after the SGPR and VGPR
4070// arguments (index 0 and 1 respectively).
4071enum ChainCallArgIdx {
4072 Exec = 2,
4073 Flags,
4074 NumVGPRs,
4075 FallbackExec,
4076 FallbackCallee
4077};
4078} // anonymous namespace
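// As a rough sketch of how these map onto a call to llvm.amdgcn.cs.chain
// (see the checks below for the authoritative rules): the SGPR and VGPR
// argument aggregates arrive at original argument indices 0 and 1, the EXEC
// mask at ChainCallArgIdx::Exec, the flags word after that, and, when bit 0
// of the flags is set, the dynamic-VGPR arguments (NumVGPRs, FallbackExec,
// FallbackCallee) trail behind.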
4079
4080// The wave scratch offset register is used as the global base pointer.
4082 SmallVectorImpl<SDValue> &InVals) const {
4083 CallingConv::ID CallConv = CLI.CallConv;
4084 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4085
4086 SelectionDAG &DAG = CLI.DAG;
4087
4088 const SDLoc &DL = CLI.DL;
4089 SDValue Chain = CLI.Chain;
4090 SDValue Callee = CLI.Callee;
4091
4092 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4093 bool UsesDynamicVGPRs = false;
4094 if (IsChainCallConv) {
4095 // The last arguments should be the value that we need to put in EXEC,
4096 // followed by the flags and any other arguments with special meanings.
4097 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4098 // we don't treat them like the "real" arguments.
4099 auto RequestedExecIt =
4100 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4101 return Arg.OrigArgIndex == 2;
4102 });
4103 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4104
4105 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4106 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4107 CLI.OutVals.end());
4108 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4109
4110 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4111 "Haven't popped all the special args");
4112
4113 TargetLowering::ArgListEntry RequestedExecArg =
4114 CLI.Args[ChainCallArgIdx::Exec];
4115 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4116 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4117
4118 // Convert constants into TargetConstants, so they become immediate operands
4119 // instead of being selected into S_MOV.
4120 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4121 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4122 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4123 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4124 } else
4125 ChainCallSpecialArgs.push_back(Arg.Node);
4126 };
4127
4128 PushNodeOrTargetConstant(RequestedExecArg);
4129
4130 // Process any other special arguments depending on the value of the flags.
4131 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4132
4133 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4134 if (FlagsValue.isZero()) {
4135 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4136 return lowerUnhandledCall(CLI, InVals,
4137 "no additional args allowed if flags == 0");
4138 } else if (FlagsValue.isOneBitSet(0)) {
4139 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4140 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4141 }
4142
4143 if (!Subtarget->isWave32()) {
4144 return lowerUnhandledCall(
4145 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4146 }
4147
4148 UsesDynamicVGPRs = true;
4149 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4150 CLI.Args.end(), PushNodeOrTargetConstant);
4151 }
4152 }
4153
4155 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4157 bool &IsTailCall = CLI.IsTailCall;
4158 bool IsVarArg = CLI.IsVarArg;
4159 bool IsSibCall = false;
4161
4162 if (Callee.isUndef() || isNullConstant(Callee)) {
4163 if (!CLI.IsTailCall) {
4164 for (ISD::InputArg &Arg : CLI.Ins)
4165 InVals.push_back(DAG.getPOISON(Arg.VT));
4166 }
4167
4168 return Chain;
4169 }
4170
4171 if (IsVarArg) {
4172 return lowerUnhandledCall(CLI, InVals,
4173 "unsupported call to variadic function ");
4174 }
4175
4176 if (!CLI.CB)
4177 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4178
4179 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4180 return lowerUnhandledCall(CLI, InVals,
4181 "unsupported required tail call to function ");
4182 }
4183
4184 if (IsTailCall) {
4185 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4186 Outs, OutVals, Ins, DAG);
4187 if (!IsTailCall &&
4188 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4189 report_fatal_error("failed to perform tail call elimination on a call "
4190 "site marked musttail or on llvm.amdgcn.cs.chain");
4191 }
4192
4193 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4194
4195 // A sibling call is one where we're under the usual C ABI and not planning
4196 // to change that, but can still do a tail call.
4197 if (!TailCallOpt && IsTailCall)
4198 IsSibCall = true;
4199
4200 if (IsTailCall)
4201 ++NumTailCalls;
4202 }
4203
4206 SmallVector<SDValue, 8> MemOpChains;
4207
4208 // Analyze operands of the call, assigning locations to each operand.
4210 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4211 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4212
4213 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4215 // With a fixed ABI, allocate fixed registers before user arguments.
4216 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4217 }
4218
4219 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4220
4221 // Get a count of how many bytes are to be pushed on the stack.
4222 unsigned NumBytes = CCInfo.getStackSize();
4223
4224 if (IsSibCall) {
4225 // Since we're not changing the ABI to make this a tail call, the memory
4226 // operands are already available in the caller's incoming argument space.
4227 NumBytes = 0;
4228 }
4229
4230 // FPDiff is the byte offset of the call's argument area from the callee's.
4231 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4232 // by this amount for a tail call. In a sibling call it must be 0 because the
4233 // caller will deallocate the entire stack and the callee still expects its
4234 // arguments to begin at SP+0. Completely unused for non-tail calls.
4235 int32_t FPDiff = 0;
4236 MachineFrameInfo &MFI = MF.getFrameInfo();
4237 auto *TRI = Subtarget->getRegisterInfo();
4238
4239 // Adjust the stack pointer for the new arguments...
4240 // These operations are automatically eliminated by the prolog/epilog pass
4241 if (!IsSibCall)
4242 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4243
4244 if (!IsSibCall || IsChainCallConv) {
4245 if (!Subtarget->enableFlatScratch()) {
4246 SmallVector<SDValue, 4> CopyFromChains;
4247
4248 // In the HSA case, this should be an identity copy.
4249 SDValue ScratchRSrcReg =
4250 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4251 RegsToPass.emplace_back(IsChainCallConv
4252 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4253 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4254 ScratchRSrcReg);
4255 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4256 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4257 }
4258 }
4259
4260 const unsigned NumSpecialInputs = RegsToPass.size();
4261
4262 MVT PtrVT = MVT::i32;
4263
4264 // Walk the register/memloc assignments, inserting copies/loads.
4265 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4266 CCValAssign &VA = ArgLocs[i];
4267 SDValue Arg = OutVals[i];
4268
4269 // Promote the value if needed.
4270 switch (VA.getLocInfo()) {
4271 case CCValAssign::Full:
4272 break;
4273 case CCValAssign::BCvt:
4274 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4275 break;
4276 case CCValAssign::ZExt:
4277 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4278 break;
4279 case CCValAssign::SExt:
4280 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4281 break;
4282 case CCValAssign::AExt:
4283 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4284 break;
4285 case CCValAssign::FPExt:
4286 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4287 break;
4288 default:
4289 llvm_unreachable("Unknown loc info!");
4290 }
4291
4292 if (VA.isRegLoc()) {
4293 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4294 } else {
4295 assert(VA.isMemLoc());
4296
4297 SDValue DstAddr;
4298 MachinePointerInfo DstInfo;
4299
4300 unsigned LocMemOffset = VA.getLocMemOffset();
4301 int32_t Offset = LocMemOffset;
4302
4303 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4304 MaybeAlign Alignment;
4305
4306 if (IsTailCall) {
4307 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4308 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4309 : VA.getValVT().getStoreSize();
4310
4311 // FIXME: We can have better than the minimum byval required alignment.
4312 Alignment =
4313 Flags.isByVal()
4314 ? Flags.getNonZeroByValAlign()
4315 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4316
4317 Offset = Offset + FPDiff;
4318 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4319
4320 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4321 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4322
4323 // Make sure any stack arguments overlapping with where we're storing
4324 // are loaded before this eventual operation. Otherwise they'll be
4325 // clobbered.
4326
4327 // FIXME: Why is this really necessary? This seems to just result in a
4328 // lot of code to copy the stack arguments and write them back to the
4329 // same locations, which are supposed to be immutable?
4330 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4331 } else {
4332 // Stores to the argument stack area are relative to the stack pointer.
4333 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4334 MVT::i32);
4335 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4336 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4337 Alignment =
4338 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4339 }
4340
4341 if (Outs[i].Flags.isByVal()) {
4342 SDValue SizeNode =
4343 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4344 SDValue Cpy =
4345 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4346 Outs[i].Flags.getNonZeroByValAlign(),
4347 /*isVol = */ false, /*AlwaysInline = */ true,
4348 /*CI=*/nullptr, std::nullopt, DstInfo,
4350
4351 MemOpChains.push_back(Cpy);
4352 } else {
4353 SDValue Store =
4354 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4355 MemOpChains.push_back(Store);
4356 }
4357 }
4358 }
4359
4360 if (!MemOpChains.empty())
4361 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4362
4363 SDValue ReadFirstLaneID =
4364 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4365
4366 SDValue TokenGlue;
4367 if (CLI.ConvergenceControlToken) {
4368 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4370 }
4371
4372 // Build a sequence of copy-to-reg nodes chained together with token chain
4373 // and flag operands which copy the outgoing args into the appropriate regs.
4374 SDValue InGlue;
4375
4376 unsigned ArgIdx = 0;
4377 for (auto [Reg, Val] : RegsToPass) {
4378 if (ArgIdx++ >= NumSpecialInputs &&
4379 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4380 // For chain calls, the inreg arguments are required to be
4381 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4382 // they are uniform.
4383 //
4384 // For other calls, if an inreg argument is known to be uniform,
4385 // speculatively insert a readfirstlane in case it is in a VGPR.
4386 //
4387 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4388 // value; for now that case continues to produce invalid code.
4389
4390 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4391 if (TokenGlue)
4392 ReadfirstlaneArgs.push_back(TokenGlue);
4394 ReadfirstlaneArgs);
4395 }
4396
4397 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4398 InGlue = Chain.getValue(1);
4399 }
4400
4401 // We don't usually want to end the call-sequence here because we would tidy
4402 // the frame up *after* the call. However, in the ABI-changing tail-call case
4403 // we've carefully laid out the parameters so that when sp is reset they'll be
4404 // in the correct location.
4405 if (IsTailCall && !IsSibCall) {
4406 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4407 InGlue = Chain.getValue(1);
4408 }
4409
4410 std::vector<SDValue> Ops({Chain});
4411
4412 // Add a redundant copy of the callee global which will not be legalized, as
4413 // we need direct access to the callee later.
4415 const GlobalValue *GV = GSD->getGlobal();
4416 Ops.push_back(Callee);
4417 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4418 } else {
4419 if (IsTailCall) {
4420 // isEligibleForTailCallOptimization considered whether the call target is
4421 // divergent, but we may still end up with a uniform value in a VGPR.
4422 // Insert a readfirstlane just in case.
4423 SDValue ReadFirstLaneID =
4424 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4425
4426 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4427 if (TokenGlue)
4428 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4429 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4430 ReadfirstlaneArgs);
4431 }
4432
4433 Ops.push_back(Callee);
4434 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4435 }
4436
4437 if (IsTailCall) {
4438 // Each tail call may have to adjust the stack by a different amount, so
4439 // this information must travel along with the operation for eventual
4440 // consumption by emitEpilogue.
4441 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4442 }
4443
4444 if (IsChainCallConv)
4445 llvm::append_range(Ops, ChainCallSpecialArgs);
4446
4447 // Add argument registers to the end of the list so that they are known live
4448 // into the call.
4449 for (auto &[Reg, Val] : RegsToPass)
4450 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4451
4452 // Add a register mask operand representing the call-preserved registers.
4453 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4454 assert(Mask && "Missing call preserved mask for calling convention");
4455 Ops.push_back(DAG.getRegisterMask(Mask));
4456
4457 if (SDValue Token = CLI.ConvergenceControlToken) {
4459 GlueOps.push_back(Token);
4460 if (InGlue)
4461 GlueOps.push_back(InGlue);
4462
4463 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4464 MVT::Glue, GlueOps),
4465 0);
4466 }
4467
4468 if (InGlue)
4469 Ops.push_back(InGlue);
4470
4471 // If we're doing a tail call, use a TC_RETURN here rather than an
4472 // actual call instruction.
4473 if (IsTailCall) {
4474 MFI.setHasTailCall();
4475 unsigned OPC = AMDGPUISD::TC_RETURN;
4476 switch (CallConv) {
4478 OPC = AMDGPUISD::TC_RETURN_GFX;
4479 break;
4482 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4483 : AMDGPUISD::TC_RETURN_CHAIN;
4484 break;
4485 }
4486
4487 // If the caller is a whole wave function, we need to use a special opcode
4488 // so we can patch up EXEC.
4489 if (Info->isWholeWaveFunction())
4490 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4491
4492 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4493 }
4494
4495 // Returns a chain and a flag for retval copy to use.
4496 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4497 Chain = Call.getValue(0);
4498 InGlue = Call.getValue(1);
4499
4500 uint64_t CalleePopBytes = NumBytes;
4501 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4502 if (!Ins.empty())
4503 InGlue = Chain.getValue(1);
4504
4505 // Handle result values, copying them out of physregs into vregs that we
4506 // return.
4507 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4508 InVals, /*IsThisReturn=*/false, SDValue());
4509}
4510
4511// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4512// except for:
4513 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4514 // 2. Size scaling, where scaled-size = wave-reduction(alloca-size) * wave-size.
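// As a worked example (illustrative numbers only): on a wave64 target, an
// 8-byte per-lane alloca is scaled to 8 << 6 = 512 bytes of scratch, the
// (upward-growing) SP is bumped by that amount, and the old SP is returned
// as the base address of the allocation.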
4516 SelectionDAG &DAG) const {
4517 const MachineFunction &MF = DAG.getMachineFunction();
4519
4520 SDLoc dl(Op);
4521 EVT VT = Op.getValueType();
4522 SDValue Chain = Op.getOperand(0);
4523 Register SPReg = Info->getStackPtrOffsetReg();
4524
4525 // Chain the dynamic stack allocation so that it doesn't modify the stack
4526 // pointer when other instructions are using the stack.
4527 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4528
4529 SDValue Size = Op.getOperand(1);
4530 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4531 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4532
4533 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4535 "Stack grows upwards for AMDGPU");
4536
4537 Chain = BaseAddr.getValue(1);
4538 Align StackAlign = TFL->getStackAlign();
4539 if (Alignment > StackAlign) {
4540 uint64_t ScaledAlignment = Alignment.value()
4541 << Subtarget->getWavefrontSizeLog2();
4542 uint64_t StackAlignMask = ScaledAlignment - 1;
4543 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4544 DAG.getConstant(StackAlignMask, dl, VT));
4545 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4546 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4547 }
4548
4549 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4550 SDValue NewSP;
4552 // For a constant-sized alloca, scale the alloca size by the wave size.
4553 SDValue ScaledSize = DAG.getNode(
4554 ISD::SHL, dl, VT, Size,
4555 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4556 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4557 } else {
4558 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4559 // max of the (divergent) alloca size and then scale it by the wave size.
4560 SDValue WaveReduction =
4561 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4562 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4563 Size, DAG.getConstant(0, dl, MVT::i32));
4564 SDValue ScaledSize = DAG.getNode(
4565 ISD::SHL, dl, VT, Size,
4566 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4567 NewSP =
4568 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4569 SDValue ReadFirstLaneID =
4570 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4571 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4572 NewSP);
4573 }
4574
4575 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4576 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4577
4578 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4579}
4580
4582 if (Op.getValueType() != MVT::i32)
4583 return Op; // Defer to cannot select error.
4584
4586 SDLoc SL(Op);
4587
4588 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4589
4590 // Convert from wave uniform to swizzled vector address. This should protect
4591 // against any edge cases where the stacksave result isn't directly used with
4592 // stackrestore.
4593 SDValue VectorAddress =
4594 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4595 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4596}
4597
4599 SelectionDAG &DAG) const {
4600 SDLoc SL(Op);
4601 assert(Op.getValueType() == MVT::i32);
4602
4603 uint32_t BothRoundHwReg =
4605 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4606
4607 SDValue IntrinID =
4608 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4609 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4610 Op.getOperand(0), IntrinID, GetRoundBothImm);
4611
4612 // There are two rounding modes, one for f32 and one for f64/f16. We only
4613 // report in the standard value range if both are the same.
4614 //
4615 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4616 // ties away from zero is not supported, and the other values are rotated by
4617 // 1.
4618 //
4619 // If the two rounding modes are not the same, report a target defined value.
4620
4621 // Mode register rounding mode fields:
4622 //
4623 // [1:0] Single-precision round mode.
4624 // [3:2] Double/Half-precision round mode.
4625 //
4626 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4627 //
4628 //                Hardware   Spec
4629 // Toward-0         3          0
4630 // Nearest Even     0          1
4631 // +Inf             1          2
4632 // -Inf             2          3
4633 // NearestAway0    N/A         4
4634 //
4635 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4636 // table we can index by the raw hardware mode.
4637 //
4638 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
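//
// As a worked example (following the table above, so treat the numbers as a
// sketch): a raw MODE.fp_round of 0 (both fields round-to-nearest-even)
// selects a 4-bit entry equal to the spec value 1, which is below 4 and
// returned as-is; entries of 4 or more are extended encodings and have 4
// added below so they land in the target-defined FLT_ROUNDS range.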
4639
4640 SDValue BitTable =
4642
4643 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4644 SDValue RoundModeTimesNumBits =
4645 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4646
4647 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4648 // knew only one mode was demanded.
4649 SDValue TableValue =
4650 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4651 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4652
4653 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4654 SDValue TableEntry =
4655 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4656
4657 // There's a gap between the 4-bit encoded table values and the actual enum
4658 // values, so offset the result if it's an extended value.
4659 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4660 SDValue IsStandardValue =
4661 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4662 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4663 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4664 TableEntry, EnumOffset);
4665
4666 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4667}
4668
4670 SelectionDAG &DAG) const {
4671 SDLoc SL(Op);
4672
4673 SDValue NewMode = Op.getOperand(1);
4674 assert(NewMode.getValueType() == MVT::i32);
4675
4676 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4677 // hardware MODE.fp_round values.
4678 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4679 uint32_t ClampedVal = std::min(
4680 static_cast<uint32_t>(ConstMode->getZExtValue()),
4682 NewMode = DAG.getConstant(
4683 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4684 } else {
4685 // If we know the input can only be one of the supported standard modes in
4686 // the range 0-3, we can use a simplified mapping to hardware values.
4687 KnownBits KB = DAG.computeKnownBits(NewMode);
4688 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4689 // The supported standard values are 0-3. The extended values start at 8. We
4690 // need to offset by 4 if the value is in the extended range.
4691
4692 if (UseReducedTable) {
4693 // Truncate to the low 32-bits.
4694 SDValue BitTable = DAG.getConstant(
4695 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4696
4697 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4698 SDValue RoundModeTimesNumBits =
4699 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4700
4701 NewMode =
4702 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4703
4704 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4705 // the table-extracted bits into inline immediates.
4706 } else {
4707 // table_index = umin(value, value - 4)
4708 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
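// E.g. (illustrative): for a standard FLT_ROUNDS value of 1, (1 - 4) wraps
// to a huge unsigned value, so umin picks 1; for an extended value of 8,
// umin(8, 4) = 4, folding the extended range onto table indices starting
// at 4.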
4709 SDValue BitTable =
4711
4712 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4713 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4714 SDValue IndexVal =
4715 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4716
4717 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4718 SDValue RoundModeTimesNumBits =
4719 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4720
4721 SDValue TableValue =
4722 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4723 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4724
4725 // No need to mask out the high bits since the setreg will ignore them
4726 // anyway.
4727 NewMode = TruncTable;
4728 }
4729
4730 // Insert a readfirstlane in case the value is a VGPR. We could do this
4731 // earlier and keep more operations scalar, but that interferes with
4732 // combining the source.
4733 SDValue ReadFirstLaneID =
4734 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4735 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4736 ReadFirstLaneID, NewMode);
4737 }
4738
4739 // N.B. The setreg will be later folded into s_round_mode on supported
4740 // targets.
4741 SDValue IntrinID =
4742 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4743 uint32_t BothRoundHwReg =
4745 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4746
4747 SDValue SetReg =
4748 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4749 IntrinID, RoundBothImm, NewMode);
4750
4751 return SetReg;
4752}
4753
4755 if (Op->isDivergent() &&
4756 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4757 // Cannot do I$ prefetch with divergent pointer.
4758 return SDValue();
4759
4760 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4764 break;
4766 if (Subtarget->hasSafeSmemPrefetch())
4767 break;
4768 [[fallthrough]];
4769 default:
4770 return SDValue();
4771 }
4772
4773 // I$ prefetch
4774 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4775 return SDValue();
4776
4777 return Op;
4778}
4779
4780 // Work around DAG legality rules that are based only on the result type.
4782 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4783 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4784 EVT SrcVT = Src.getValueType();
4785
4786 if (SrcVT.getScalarType() != MVT::bf16)
4787 return Op;
4788
4789 SDLoc SL(Op);
4790 SDValue BitCast =
4791 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4792
4793 EVT DstVT = Op.getValueType();
4794 if (IsStrict)
4795 llvm_unreachable("Need STRICT_BF16_TO_FP");
4796
4797 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4798}
4799
4801 SDLoc SL(Op);
4802 if (Op.getValueType() != MVT::i64)
4803 return Op;
4804
4805 uint32_t ModeHwReg =
4807 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4808 uint32_t TrapHwReg =
4810 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4811
4812 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4813 SDValue IntrinID =
4814 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4815 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4816 Op.getOperand(0), IntrinID, ModeHwRegImm);
4817 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4818 Op.getOperand(0), IntrinID, TrapHwRegImm);
4819 SDValue TokenReg =
4820 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4821 GetTrapReg.getValue(1));
4822
4823 SDValue CvtPtr =
4824 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4825 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4826
4827 return DAG.getMergeValues({Result, TokenReg}, SL);
4828}
4829
4831 SDLoc SL(Op);
4832 if (Op.getOperand(1).getValueType() != MVT::i64)
4833 return Op;
4834
4835 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4836 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4837 DAG.getConstant(0, SL, MVT::i32));
4838 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4839 DAG.getConstant(1, SL, MVT::i32));
4840
4841 SDValue ReadFirstLaneID =
4842 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4843 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4844 ReadFirstLaneID, NewModeReg);
4845 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4846 ReadFirstLaneID, NewTrapReg);
4847
4848 unsigned ModeHwReg =
4850 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4851 unsigned TrapHwReg =
4853 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4854
4855 SDValue IntrinID =
4856 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4857 SDValue SetModeReg =
4858 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4859 IntrinID, ModeHwRegImm, NewModeReg);
4860 SDValue SetTrapReg =
4861 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4862 IntrinID, TrapHwRegImm, NewTrapReg);
4863 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4864}
4865
4867 const MachineFunction &MF) const {
4868 const Function &Fn = MF.getFunction();
4869
4871 .Case("m0", AMDGPU::M0)
4872 .Case("exec", AMDGPU::EXEC)
4873 .Case("exec_lo", AMDGPU::EXEC_LO)
4874 .Case("exec_hi", AMDGPU::EXEC_HI)
4875 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4876 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4877 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4878 .Default(Register());
4879 if (!Reg)
4880 return Reg;
4881
4882 if (!Subtarget->hasFlatScrRegister() &&
4883 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4884 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4885 "\" for subtarget."));
4886 }
4887
4888 switch (Reg) {
4889 case AMDGPU::M0:
4890 case AMDGPU::EXEC_LO:
4891 case AMDGPU::EXEC_HI:
4892 case AMDGPU::FLAT_SCR_LO:
4893 case AMDGPU::FLAT_SCR_HI:
4894 if (VT.getSizeInBits() == 32)
4895 return Reg;
4896 break;
4897 case AMDGPU::EXEC:
4898 case AMDGPU::FLAT_SCR:
4899 if (VT.getSizeInBits() == 64)
4900 return Reg;
4901 break;
4902 default:
4903 llvm_unreachable("missing register type checking");
4904 }
4905
4907 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4908}
4909
4910// If kill is not the last instruction, split the block so kill is always a
4911// proper terminator.
4914 MachineBasicBlock *BB) const {
4915 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4917 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4918 return SplitBB;
4919}
4920
4921 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4922// \p MI will be the only instruction in the loop body block. Otherwise, it will
4923// be the first instruction in the remainder block.
4924//
4925/// \returns { LoopBody, Remainder }
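// The resulting control flow is, roughly:
//
//   MBB -> LoopBody -> Remainder
//
// with LoopBody also listed as its own successor; the actual back-branch is
// inserted by the caller.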
4926static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4928 MachineFunction *MF = MBB.getParent();
4930
4931 // To insert the loop we need to split the block. Move everything after this
4932 // point to a new block, and insert a new empty block between the two.
4934 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4936 ++MBBI;
4937
4938 MF->insert(MBBI, LoopBB);
4939 MF->insert(MBBI, RemainderBB);
4940
4941 LoopBB->addSuccessor(LoopBB);
4942 LoopBB->addSuccessor(RemainderBB);
4943
4944 // Move the rest of the block into a new block.
4945 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4946
4947 if (InstInLoop) {
4948 auto Next = std::next(I);
4949
4950 // Move instruction to loop body.
4951 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4952
4953 // Move the rest of the block.
4954 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4955 } else {
4956 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4957 }
4958
4959 MBB.addSuccessor(LoopBB);
4960
4961 return std::pair(LoopBB, RemainderBB);
4962}
4963
4964/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4966 MachineBasicBlock *MBB = MI.getParent();
4968 auto I = MI.getIterator();
4969 auto E = std::next(I);
4970
4971 // clang-format off
4972 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4973 .addImm(0);
4974 // clang-format on
4975
4976 MIBundleBuilder Bundler(*MBB, I, E);
4977 finalizeBundle(*MBB, Bundler.begin());
4978}
4979
4982 MachineBasicBlock *BB) const {
4983 const DebugLoc &DL = MI.getDebugLoc();
4984
4986
4988
4989 // Apparently kill flags are only valid if the def is in the same block?
4990 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4991 Src->setIsKill(false);
4992
4993 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4994
4995 MachineBasicBlock::iterator I = LoopBB->end();
4996
4997 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4999
5000 // Clear TRAP_STS.MEM_VIOL
5001 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5002 .addImm(0)
5003 .addImm(EncodedReg);
5004
5006
5007 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5008
5009 // Load and check TRAP_STS.MEM_VIOL
5010 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5011 .addImm(EncodedReg);
5012
5013 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5014 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5015 .addReg(Reg, RegState::Kill)
5016 .addImm(0);
5017 // clang-format off
5018 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5019 .addMBB(LoopBB);
5020 // clang-format on
5021
5022 return RemainderBB;
5023}
5024
5025// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5026// wavefront. If the value is uniform and just happens to be in a VGPR, this
5027// will only do one iteration. In the worst case, this will loop 64 times.
5028//
5029// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
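// The loop emitted below has roughly this shape (a sketch of the MIR, not an
// exact listing):
//
//   loop:
//     CurrentIdx = V_READFIRSTLANE_B32 Idx
//     Cond       = V_CMP_EQ_U32 CurrentIdx, Idx   ; per-lane compare
//     NewExec    = AND-saveexec Cond              ; lanes handled this trip
//     M0 (or SGPRIdxReg) = CurrentIdx [+ Offset]
//     ...indexed access...
//     EXEC       = EXEC xor NewExec               ; retire handled lanes
//     S_CBRANCH_EXECNZ loop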
5032 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5033 const DebugLoc &DL, const MachineOperand &Idx,
5034 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5035 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5036 Register &SGPRIdxReg) {
5037
5038 MachineFunction *MF = OrigBB.getParent();
5039 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5040 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5043
5044 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5045 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5046 Register NewExec = MRI.createVirtualRegister(BoolRC);
5047 Register CurrentIdxReg =
5048 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5049 Register CondReg = MRI.createVirtualRegister(BoolRC);
5050
5051 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5052 .addReg(InitReg)
5053 .addMBB(&OrigBB)
5054 .addReg(ResultReg)
5055 .addMBB(&LoopBB);
5056
5057 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5058 .addReg(InitSaveExecReg)
5059 .addMBB(&OrigBB)
5060 .addReg(NewExec)
5061 .addMBB(&LoopBB);
5062
5063 // Read the next variant <- also loop target.
5064 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5065 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5066
5067 // Compare the just read M0 value to all possible Idx values.
5068 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5069 .addReg(CurrentIdxReg)
5070 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5071
5072 // Update EXEC, save the original EXEC value to VCC.
5073 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5074 .addReg(CondReg, RegState::Kill);
5075
5076 MRI.setSimpleHint(NewExec, CondReg);
5077
5078 if (UseGPRIdxMode) {
5079 if (Offset == 0) {
5080 SGPRIdxReg = CurrentIdxReg;
5081 } else {
5082 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5083 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5084 .addReg(CurrentIdxReg, RegState::Kill)
5085 .addImm(Offset);
5086 }
5087 } else {
5088 // Move index from VCC into M0
5089 if (Offset == 0) {
5090 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5091 .addReg(CurrentIdxReg, RegState::Kill);
5092 } else {
5093 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5094 .addReg(CurrentIdxReg, RegState::Kill)
5095 .addImm(Offset);
5096 }
5097 }
5098
5099 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5100 MachineInstr *InsertPt =
5101 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5102 .addReg(LMC.ExecReg)
5103 .addReg(NewExec);
5104
5105 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5106 // s_cbranch_scc0?
5107
5108 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5109 // clang-format off
5110 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5111 .addMBB(&LoopBB);
5112 // clang-format on
5113
5114 return InsertPt->getIterator();
5115}
5116
5117// This has slightly sub-optimal regalloc when the source vector is killed by
5118// the read. The register allocator does not understand that the kill is
5119 // per-workitem, so the vector is kept alive for the whole loop and we end up
5120 // not re-using a subregister from it, using one more VGPR than necessary. That
5121 // VGPR was saved back when this expansion was done after register allocation.
5124 unsigned InitResultReg, unsigned PhiReg, int Offset,
5125 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5126 MachineFunction *MF = MBB.getParent();
5127 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5128 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5130 const DebugLoc &DL = MI.getDebugLoc();
5132
5133 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5134 Register DstReg = MI.getOperand(0).getReg();
5135 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5136 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5138
5139 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5140
5141 // Save the EXEC mask
5142 // clang-format off
5143 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5144 .addReg(LMC.ExecReg);
5145 // clang-format on
5146
5147 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5148
5149 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5150
5151 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5152 InitResultReg, DstReg, PhiReg, TmpExec,
5153 Offset, UseGPRIdxMode, SGPRIdxReg);
5154
5155 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5157 ++MBBI;
5158 MF->insert(MBBI, LandingPad);
5159 LoopBB->removeSuccessor(RemainderBB);
5160 LandingPad->addSuccessor(RemainderBB);
5161 LoopBB->addSuccessor(LandingPad);
5162 MachineBasicBlock::iterator First = LandingPad->begin();
5163 // clang-format off
5164 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5165 .addReg(SaveExec);
5166 // clang-format on
5167
5168 return InsPt;
5169}
5170
5171// Returns subreg index, offset
5172static std::pair<unsigned, int>
5174 const TargetRegisterClass *SuperRC, unsigned VecReg,
5175 int Offset) {
5176 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5177
5178 // Skip out of bounds offsets, or else we would end up using an undefined
5179 // register.
5180 if (Offset >= NumElts || Offset < 0)
5181 return std::pair(AMDGPU::sub0, Offset);
5182
5183 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5184}
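// Worked example: for a 128-bit (4 x 32-bit) super-register NumElts is 4, so
// Offset 2 yields (sub2, 0), i.e. a statically known subregister and no
// residual dynamic offset, while Offset 7 or a negative offset is returned
// unchanged as (sub0, Offset) and stays folded into the dynamic index.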
5185
5188 int Offset) {
5189 MachineBasicBlock *MBB = MI.getParent();
5190 const DebugLoc &DL = MI.getDebugLoc();
5192
5193 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5194
5195 assert(Idx->getReg() != AMDGPU::NoRegister);
5196
5197 if (Offset == 0) {
5198 // clang-format off
5199 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5200 .add(*Idx);
5201 // clang-format on
5202 } else {
5203 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5204 .add(*Idx)
5205 .addImm(Offset);
5206 }
5207}
5208
5211 int Offset) {
5212 MachineBasicBlock *MBB = MI.getParent();
5213 const DebugLoc &DL = MI.getDebugLoc();
5215
5216 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5217
5218 if (Offset == 0)
5219 return Idx->getReg();
5220
5221 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5222 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5223 .add(*Idx)
5224 .addImm(Offset);
5225 return Tmp;
5226}
5227
5228static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5229 MachineBasicBlock &MBB,
5230 const GCNSubtarget &ST) {
5231 const SIInstrInfo *TII = ST.getInstrInfo();
5232 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5233 MachineFunction *MF = MBB.getParent();
5235
5236 Register Dst = MI.getOperand(0).getReg();
5237 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5238 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5239 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5240
5241 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5242 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5243
5244 unsigned SubReg;
5245 std::tie(SubReg, Offset) =
5246 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5247
5248 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5249
5250 // Check for a SGPR index.
5251 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5253 const DebugLoc &DL = MI.getDebugLoc();
5254
5255 if (UseGPRIdxMode) {
5256 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5257 // to avoid interfering with other uses, so probably requires a new
5258 // optimization pass.
5260
5261 const MCInstrDesc &GPRIDXDesc =
5262 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5263 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5264 .addReg(SrcReg)
5265 .addReg(Idx)
5266 .addImm(SubReg);
5267 } else {
5269
5270 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5271 .addReg(SrcReg, 0, SubReg)
5272 .addReg(SrcReg, RegState::Implicit);
5273 }
5274
5275 MI.eraseFromParent();
5276
5277 return &MBB;
5278 }
5279
5280 // Control flow needs to be inserted if indexing with a VGPR.
5281 const DebugLoc &DL = MI.getDebugLoc();
5283
5284 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5285 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5286
5287 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5288
5289 Register SGPRIdxReg;
5290 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5291 UseGPRIdxMode, SGPRIdxReg);
5292
5293 MachineBasicBlock *LoopBB = InsPt->getParent();
5294
5295 if (UseGPRIdxMode) {
5296 const MCInstrDesc &GPRIDXDesc =
5297 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5298
5299 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5300 .addReg(SrcReg)
5301 .addReg(SGPRIdxReg)
5302 .addImm(SubReg);
5303 } else {
5304 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5305 .addReg(SrcReg, 0, SubReg)
5306 .addReg(SrcReg, RegState::Implicit);
5307 }
5308
5309 MI.eraseFromParent();
5310
5311 return LoopBB;
5312}
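// To summarize the two paths above: a uniform (SGPR) index lowers to a single
// V_MOVRELS_B32 (or GPR-idx pseudo) reading src[index + SubReg], while a
// divergent (VGPR) index wraps that same instruction in the waterfall loop
// built by loadM0FromVGPR so each distinct index value runs with its own EXEC
// subset.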
5313
5314static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5315 MachineBasicBlock &MBB,
5316 const GCNSubtarget &ST) {
5317 const SIInstrInfo *TII = ST.getInstrInfo();
5318 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5319 MachineFunction *MF = MBB.getParent();
5321
5322 Register Dst = MI.getOperand(0).getReg();
5323 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5324 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5325 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5326 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5327 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5328 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5329
5330 // This can be an immediate, but will be folded later.
5331 assert(Val->getReg());
5332
5333 unsigned SubReg;
5334 std::tie(SubReg, Offset) =
5335 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5336 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5337
5338 if (Idx->getReg() == AMDGPU::NoRegister) {
5340 const DebugLoc &DL = MI.getDebugLoc();
5341
5342 assert(Offset == 0);
5343
5344 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5345 .add(*SrcVec)
5346 .add(*Val)
5347 .addImm(SubReg);
5348
5349 MI.eraseFromParent();
5350 return &MBB;
5351 }
5352
5353 // Check for a SGPR index.
5354 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5356 const DebugLoc &DL = MI.getDebugLoc();
5357
5358 if (UseGPRIdxMode) {
5360
5361 const MCInstrDesc &GPRIDXDesc =
5362 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5363 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5364 .addReg(SrcVec->getReg())
5365 .add(*Val)
5366 .addReg(Idx)
5367 .addImm(SubReg);
5368 } else {
5370
5371 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5372 TRI.getRegSizeInBits(*VecRC), 32, false);
5373 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5374 .addReg(SrcVec->getReg())
5375 .add(*Val)
5376 .addImm(SubReg);
5377 }
5378 MI.eraseFromParent();
5379 return &MBB;
5380 }
5381
5382 // Control flow needs to be inserted if indexing with a VGPR.
5383 if (Val->isReg())
5384 MRI.clearKillFlags(Val->getReg());
5385
5386 const DebugLoc &DL = MI.getDebugLoc();
5387
5388 Register PhiReg = MRI.createVirtualRegister(VecRC);
5389
5390 Register SGPRIdxReg;
5391 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5392 UseGPRIdxMode, SGPRIdxReg);
5393 MachineBasicBlock *LoopBB = InsPt->getParent();
5394
5395 if (UseGPRIdxMode) {
5396 const MCInstrDesc &GPRIDXDesc =
5397 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5398
5399 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5400 .addReg(PhiReg)
5401 .add(*Val)
5402 .addReg(SGPRIdxReg)
5403 .addImm(SubReg);
5404 } else {
5405 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5406 TRI.getRegSizeInBits(*VecRC), 32, false);
5407 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5408 .addReg(PhiReg)
5409 .add(*Val)
5410 .addImm(SubReg);
5411 }
5412
5413 MI.eraseFromParent();
5414 return LoopBB;
5415}
5416
5417static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5418 MachineBasicBlock *BB) {
5419 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5420 // For GFX12, we emit s_add_u64 and s_sub_u64.
5421 MachineFunction *MF = BB->getParent();
5422 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5423 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5425 const DebugLoc &DL = MI.getDebugLoc();
5426 MachineOperand &Dest = MI.getOperand(0);
5427 MachineOperand &Src0 = MI.getOperand(1);
5428 MachineOperand &Src1 = MI.getOperand(2);
5429 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5430 if (ST.hasScalarAddSub64()) {
5431 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5432 // clang-format off
5433 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5434 .add(Src0)
5435 .add(Src1);
5436 // clang-format on
5437 } else {
5438 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5439 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5440
5441 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5442 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5443
5444 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5445 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5446 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5447 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5448
5449 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5450 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5451 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5452 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5453
5454 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5455 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5456 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5457 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5458 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5459 .addReg(DestSub0)
5460 .addImm(AMDGPU::sub0)
5461 .addReg(DestSub1)
5462 .addImm(AMDGPU::sub1);
5463 }
5464 MI.eraseFromParent();
5465 return BB;
5466}
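// Worked example of the pre-GFX12 expansion above, for the add form:
//   dst.lo = S_ADD_U32  src0.lo, src1.lo   ; sets SCC on unsigned carry-out
//   dst.hi = S_ADDC_U32 src0.hi, src1.hi   ; consumes SCC as the carry-in
// e.g. 0x00000000_FFFFFFFF + 1: the low add wraps to 0 and sets SCC, so the
// high half becomes 0 + 0 + 1, giving 0x00000001_00000000.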
5467
5468static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5469 switch (Opc) {
5470 case AMDGPU::S_MIN_U32:
5471 return std::numeric_limits<uint32_t>::max();
5472 case AMDGPU::S_MIN_I32:
5473 return std::numeric_limits<int32_t>::max();
5474 case AMDGPU::S_MAX_U32:
5475 return std::numeric_limits<uint32_t>::min();
5476 case AMDGPU::S_MAX_I32:
5477 return std::numeric_limits<int32_t>::min();
5478 case AMDGPU::V_ADD_F32_e64: // -0.0
5479 return 0x80000000;
5480 case AMDGPU::V_SUB_F32_e64: // +0.0
5481 return 0x0;
5482 case AMDGPU::S_ADD_I32:
5483 case AMDGPU::S_SUB_I32:
5484 case AMDGPU::S_OR_B32:
5485 case AMDGPU::S_XOR_B32:
5486 return std::numeric_limits<uint32_t>::min();
5487 case AMDGPU::S_AND_B32:
5488 return std::numeric_limits<uint32_t>::max();
5489 case AMDGPU::V_MIN_F32_e64:
5490 case AMDGPU::V_MAX_F32_e64:
5491 return 0x7fc00000; // qNAN
5492 default:
5494 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5495 }
5496}
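// These are the neutral starting values of the corresponding reductions:
// folding the first element into the identity leaves that element (or its
// negation, for the subtract forms) unchanged. Hence the largest representable
// value for min, the smallest for max, 0 for add/sub/or/xor, all-ones for and,
// -0.0 for fadd, +0.0 for fsub, and a quiet NaN for fmin/fmax, since
// minnum/maxnum-style min/max return the non-NaN operand on the first combine.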
5497
5498static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5499 switch (Opc) {
5500 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5501 return std::numeric_limits<uint64_t>::max();
5502 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5503 return std::numeric_limits<int64_t>::max();
5504 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5505 return std::numeric_limits<uint64_t>::min();
5506 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5507 return std::numeric_limits<int64_t>::min();
5508 case AMDGPU::S_ADD_U64_PSEUDO:
5509 case AMDGPU::S_SUB_U64_PSEUDO:
5510 case AMDGPU::S_OR_B64:
5511 case AMDGPU::S_XOR_B64:
5512 return std::numeric_limits<uint64_t>::min();
5513 case AMDGPU::S_AND_B64:
5514 return std::numeric_limits<uint64_t>::max();
5515 default:
5517 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5518 }
5519}
5520
5521static bool is32bitWaveReduceOperation(unsigned Opc) {
5522 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5523 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5524 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5525 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5526 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5527 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5528 Opc == AMDGPU::V_SUB_F32_e64;
5529}
5530
5532 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5533 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5534}
5535
5536static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5537 MachineBasicBlock &BB,
5538 const GCNSubtarget &ST,
5539 unsigned Opc) {
5541 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5542 const DebugLoc &DL = MI.getDebugLoc();
5543 const SIInstrInfo *TII = ST.getInstrInfo();
5544
5545 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5546 Register SrcReg = MI.getOperand(1).getReg();
5547 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5548 Register DstReg = MI.getOperand(0).getReg();
5549 MachineBasicBlock *RetBB = nullptr;
5550 if (isSGPR) {
5551 switch (Opc) {
5552 case AMDGPU::S_MIN_U32:
5553 case AMDGPU::S_MIN_I32:
5554 case AMDGPU::V_MIN_F32_e64:
5555 case AMDGPU::S_MAX_U32:
5556 case AMDGPU::S_MAX_I32:
5557 case AMDGPU::V_MAX_F32_e64:
5558 case AMDGPU::S_AND_B32:
5559 case AMDGPU::S_OR_B32: {
5560 // Idempotent operations.
5561 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5562 RetBB = &BB;
5563 break;
5564 }
5565 case AMDGPU::V_CMP_LT_U64_e64: // umin
5566 case AMDGPU::V_CMP_LT_I64_e64: // min
5567 case AMDGPU::V_CMP_GT_U64_e64: // umax
5568 case AMDGPU::V_CMP_GT_I64_e64: // max
5569 case AMDGPU::S_AND_B64:
5570 case AMDGPU::S_OR_B64: {
5571 // Idempotent operations.
5572 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5573 RetBB = &BB;
5574 break;
5575 }
5576 case AMDGPU::S_XOR_B32:
5577 case AMDGPU::S_XOR_B64:
5578 case AMDGPU::S_ADD_I32:
5579 case AMDGPU::S_ADD_U64_PSEUDO:
5580 case AMDGPU::V_ADD_F32_e64:
5581 case AMDGPU::S_SUB_I32:
5582 case AMDGPU::S_SUB_U64_PSEUDO:
5583 case AMDGPU::V_SUB_F32_e64: {
5584 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5585 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5586 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5587 Register NumActiveLanes =
5588 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5589
5590 bool IsWave32 = ST.isWave32();
5591 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5592 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5593 unsigned BitCountOpc =
5594 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5595
5596 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5597
5598 auto NewAccumulator =
5599 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5600 .addReg(ExecMask);
5601
5602 switch (Opc) {
5603 case AMDGPU::S_XOR_B32:
5604 case AMDGPU::S_XOR_B64: {
5605 // Performing an XOR operation on a uniform value
5606 // depends on the parity of the number of active lanes.
5607 // For even parity, the result will be 0, for odd
5608 // parity the result will be the same as the input value.
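 // Concretely: x ^ x ^ ... ^ x over n active lanes is x when n is odd and 0
 // when n is even, so the code below computes src * (n & 1); e.g. with
 // src = 0x1234 and 6 active lanes the reduction is 0x1234 * 0 = 0.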
5609 Register ParityRegister =
5610 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5611
5612 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5613 .addReg(NewAccumulator->getOperand(0).getReg())
5614 .addImm(1)
5615 .setOperandDead(3); // Dead scc
5616 if (Opc == AMDGPU::S_XOR_B32) {
5617 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5618 .addReg(SrcReg)
5619 .addReg(ParityRegister);
5620 } else {
5621 Register DestSub0 =
5622 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5623 Register DestSub1 =
5624 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5625
5626 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5627 const TargetRegisterClass *SrcSubRC =
5628 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5629
5630 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5631 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5632 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5633 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5634
5635 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5636 .add(Op1L)
5637 .addReg(ParityRegister);
5638
5639 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5640 .add(Op1H)
5641 .addReg(ParityRegister);
5642
5643 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5644 .addReg(DestSub0)
5645 .addImm(AMDGPU::sub0)
5646 .addReg(DestSub1)
5647 .addImm(AMDGPU::sub1);
5648 }
5649 break;
5650 }
5651 case AMDGPU::S_SUB_I32: {
5652 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5653
5654 // Take the negation of the source operand.
5655 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5656 .addImm(0)
5657 .addReg(SrcReg);
5658 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5659 .addReg(NegatedVal)
5660 .addReg(NewAccumulator->getOperand(0).getReg());
5661 break;
5662 }
5663 case AMDGPU::S_ADD_I32: {
5664 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5665 .addReg(SrcReg)
5666 .addReg(NewAccumulator->getOperand(0).getReg());
5667 break;
5668 }
5669 case AMDGPU::S_ADD_U64_PSEUDO:
5670 case AMDGPU::S_SUB_U64_PSEUDO: {
5671 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5672 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5673 Register Op1H_Op0L_Reg =
5674 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5675 Register Op1L_Op0H_Reg =
5676 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5677 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5678 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5679 Register NegatedValLo =
5680 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5681 Register NegatedValHi =
5682 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5683
5684 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5685 const TargetRegisterClass *Src1SubRC =
5686 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5687
5688 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5689 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5690 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5691 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5692
5693 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5694 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5695 .addImm(0)
5696 .addReg(NewAccumulator->getOperand(0).getReg())
5697 .setOperandDead(3); // Dead scc
5698 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5699 .addReg(NegatedValLo)
5700 .addImm(31)
5701 .setOperandDead(3); // Dead scc
5702 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5703 .add(Op1L)
5704 .addReg(NegatedValHi);
5705 }
5706 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5707 ? NegatedValLo
5708 : NewAccumulator->getOperand(0).getReg();
5709 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5710 .add(Op1L)
5711 .addReg(LowOpcode);
5712 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5713 .add(Op1L)
5714 .addReg(LowOpcode);
5715 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5716 .add(Op1H)
5717 .addReg(LowOpcode);
5718
5719 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5720 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5721 .addReg(CarryReg)
5722 .addReg(Op1H_Op0L_Reg)
5723 .setOperandDead(3); // Dead scc
5724
5725 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5726 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5727 .addReg(HiVal)
5728 .addReg(Op1L_Op0H_Reg)
5729 .setOperandDead(3); // Dead scc
5730 }
5731 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5732 .addReg(DestSub0)
5733 .addImm(AMDGPU::sub0)
5734 .addReg(DestSub1)
5735 .addImm(AMDGPU::sub1);
5736 break;
5737 }
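 // In other words, for a uniform input the add reduction is
 // src * popcount(EXEC) and the sub reduction is -src * popcount(EXEC); the
 // code above assembles that 64 x 32-bit product from S_MUL_I32 / S_MUL_HI_U32
 // pieces. For example, with src = 3 and 10 active lanes the result is 30
 // (or -30 for the sub form).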
5738 case AMDGPU::V_ADD_F32_e64:
5739 case AMDGPU::V_SUB_F32_e64: {
5740 Register ActiveLanesVreg =
5741 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5742 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5743 // Get number of active lanes as a float val.
5744 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5745 ActiveLanesVreg)
5746 .addReg(NewAccumulator->getOperand(0).getReg())
5747 .addImm(0) // clamp
5748 .addImm(0); // output-modifier
5749
5750 // Take negation of input for SUB reduction
5751 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5752 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5753 .addImm(srcMod) // src0 modifier
5754 .addReg(SrcReg)
5755 .addImm(0) // src1 modifier
5756 .addReg(ActiveLanesVreg)
5757 .addImm(0) // clamp
5758 .addImm(0); // output-mod
5759 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5760 .addReg(DstVreg);
5761 }
5762 }
5763 RetBB = &BB;
5764 }
5765 }
5766 } else {
5767 // TODO: Implement DPP Strategy and switch based on immediate strategy
5768 // operand. For now, for all the cases (default, Iterative and DPP) we use
5769 // the iterative approach by default.
5770
5771 // To reduce the VGPR using the iterative approach, we need to iterate
5772 // over all the active lanes. Lowering consists of ComputeLoop,
5773 // which iterates over only the active lanes. We use a copy of the EXEC
5774 // register as the induction variable, and every iteration clears the current
5775 // lane's bit using bitset0 so that the next iteration picks the next active lane.
5777 Register SrcReg = MI.getOperand(1).getReg();
5778 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5780
5781 // Create the control flow for the loop:
5782 // split the machine basic block containing MI to form the loop.
5783 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5784
5785 // Create virtual registers required for lowering.
5786 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5787 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5788 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5789 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5790 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5791 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5792 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5793 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5794 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5795
5796 bool IsWave32 = ST.isWave32();
5797 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5798 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5799
5800 // Create the initial values of the induction variable (from EXEC) and the
5801 // accumulator, and insert a branch to the newly created ComputeLoop block.
5802 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5803 if (is32BitOpc) {
5804 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5805 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5806 .addImm(IdentityValue);
5807 } else {
5808 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5809 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5810 .addImm(IdentityValue);
5811 }
5812 // clang-format off
5813 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5814 .addMBB(ComputeLoop);
5815 // clang-format on
5816
5817 // Start constructing ComputeLoop
5818 I = ComputeLoop->begin();
5819 auto Accumulator =
5820 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5821 .addReg(IdentityValReg)
5822 .addMBB(&BB);
5823 auto ActiveBits =
5824 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5825 .addReg(LoopIterator)
5826 .addMBB(&BB);
5827
5828 I = ComputeLoop->end();
5829 MachineInstr *NewAccumulator;
5830 // Perform the computations
5831 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5832 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5833 .addReg(ActiveBitsReg);
5834 if (is32BitOpc) {
5835 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5836 LaneValueReg)
5837 .addReg(SrcReg)
5838 .addReg(FF1Reg);
5839 if (isFPOp) {
5840 Register LaneValVreg =
5841 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5842 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5843 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5844 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5845 LaneValVreg)
5846 .addReg(LaneValueReg);
5847 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5848 .addImm(0) // src0 modifier
5849 .addReg(Accumulator->getOperand(0).getReg())
5850 .addImm(0) // src1 modifier
5851 .addReg(LaneValVreg)
5852 .addImm(0) // clamp
5853 .addImm(0); // omod
5854 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5855 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5856 .addReg(DstVreg);
5857 } else {
5858 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5859 .addReg(Accumulator->getOperand(0).getReg())
5860 .addReg(LaneValueReg);
5861 }
5862 } else {
5863 Register LaneValueLoReg =
5864 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5865 Register LaneValueHiReg =
5866 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5867 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5868 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5869 const TargetRegisterClass *SrcSubRC =
5870 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5871 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5872 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5873 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5874 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5875 // lane value input should be in an sgpr
5876 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5877 LaneValueLoReg)
5878 .add(Op1L)
5879 .addReg(FF1Reg);
5880 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5881 LaneValueHiReg)
5882 .add(Op1H)
5883 .addReg(FF1Reg);
5884 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5885 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5886 .addReg(LaneValueLoReg)
5887 .addImm(AMDGPU::sub0)
5888 .addReg(LaneValueHiReg)
5889 .addImm(AMDGPU::sub1);
5890 switch (Opc) {
5891 case AMDGPU::S_OR_B64:
5892 case AMDGPU::S_AND_B64:
5893 case AMDGPU::S_XOR_B64: {
5894 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5895 .addReg(Accumulator->getOperand(0).getReg())
5896 .addReg(LaneValue->getOperand(0).getReg())
5897 .setOperandDead(3); // Dead scc
5898 break;
5899 }
5900 case AMDGPU::V_CMP_GT_I64_e64:
5901 case AMDGPU::V_CMP_GT_U64_e64:
5902 case AMDGPU::V_CMP_LT_I64_e64:
5903 case AMDGPU::V_CMP_LT_U64_e64: {
5904 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5905 Register ComparisonResultReg =
5906 MRI.createVirtualRegister(WaveMaskRegClass);
5907 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5908 const TargetRegisterClass *VSubRegClass =
5909 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5910 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5911 MachineOperand SrcReg0Sub0 =
5912 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5913 VregClass, AMDGPU::sub0, VSubRegClass);
5914 MachineOperand SrcReg0Sub1 =
5915 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5916 VregClass, AMDGPU::sub1, VSubRegClass);
5917 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5918 AccumulatorVReg)
5919 .add(SrcReg0Sub0)
5920 .addImm(AMDGPU::sub0)
5921 .add(SrcReg0Sub1)
5922 .addImm(AMDGPU::sub1);
5923 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5924 .addReg(LaneValue->getOperand(0).getReg())
5925 .addReg(AccumulatorVReg);
5926
5927 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5928 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5929 .addReg(LaneMaskReg)
5930 .addReg(ActiveBitsReg);
5931
5932 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5933 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5934 .addReg(LaneValue->getOperand(0).getReg())
5935 .addReg(Accumulator->getOperand(0).getReg());
5936 break;
5937 }
5938 case AMDGPU::S_ADD_U64_PSEUDO:
5939 case AMDGPU::S_SUB_U64_PSEUDO: {
5940 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5941 .addReg(Accumulator->getOperand(0).getReg())
5942 .addReg(LaneValue->getOperand(0).getReg());
5943 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5944 break;
5945 }
5946 }
5947 }
5948 // Manipulate the iterator to get the next active lane
5949 unsigned BITSETOpc =
5950 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5951 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5952 .addReg(FF1Reg)
5953 .addReg(ActiveBitsReg);
5954
5955 // Add phi nodes
5956 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5957 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5958
5959 // Create the branch back to ComputeLoop while any active lanes remain.
5960 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5961 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5962 .addReg(NewActiveBitsReg)
5963 .addImm(0);
5964 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5965 .addMBB(ComputeLoop);
5966
5967 RetBB = ComputeEnd;
5968 }
5969 MI.eraseFromParent();
5970 return RetBB;
5971}
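// For reference, the divergent-input path above builds a loop of roughly this
// shape (wave64 opcodes shown, register names are placeholders):
//
//   entry:
//     %iter = COPY $exec                   ; mask of lanes still to visit
//     %acc  = <identity value>
//   compute_loop:
//     %lane = S_FF1_I32_B64 %iter          ; lowest still-active lane
//     %val  = V_READLANE_B32 %src, %lane   ; that lane's contribution
//     %acc  = <op> %acc, %val              ; fold it into the accumulator
//     %iter = S_BITSET0_B64 %lane, %iter   ; retire the lane
//     S_CMP_LG_U64 %iter, 0
//     S_CBRANCH_SCC1 compute_loop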
5972
5973MachineBasicBlock *
5974SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5975 MachineBasicBlock *BB) const {
5976 MachineFunction *MF = BB->getParent();
5978 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5980 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5982 const DebugLoc &DL = MI.getDebugLoc();
5983
5984 switch (MI.getOpcode()) {
5985 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5986 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5987 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5988 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5989 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5990 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5991 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5992 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5993 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5994 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
5995 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5996 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5997 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5998 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5999 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6000 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6001 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6002 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6003 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6004 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6005 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6006 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6007 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6008 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6009 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6010 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6011 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6012 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6013 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6014 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6015 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6016 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6017 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6018 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6019 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6020 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6021 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6022 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6023 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6024 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6025 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6026 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6027 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6028 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6029 case AMDGPU::S_UADDO_PSEUDO:
6030 case AMDGPU::S_USUBO_PSEUDO: {
6031 MachineOperand &Dest0 = MI.getOperand(0);
6032 MachineOperand &Dest1 = MI.getOperand(1);
6033 MachineOperand &Src0 = MI.getOperand(2);
6034 MachineOperand &Src1 = MI.getOperand(3);
6035
6036 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6037 ? AMDGPU::S_ADD_U32
6038 : AMDGPU::S_SUB_U32;
6039 // clang-format off
6040 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6041 .add(Src0)
6042 .add(Src1);
6043 // clang-format on
6044
6045 unsigned SelOpc =
6046 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6047 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6048
6049 MI.eraseFromParent();
6050 return BB;
6051 }
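 // A minimal sketch of what the expansion above produces for s_uaddo:
 //   %sum   = S_ADD_U32 %a, %b             ; SCC = unsigned carry-out
 //   %carry = S_CSELECT_B64 -1, 0          ; (S_CSELECT_B32 in wave32)
 // so the second result is the usual all-ones/zero lane-mask carry value.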
6052 case AMDGPU::S_ADD_U64_PSEUDO:
6053 case AMDGPU::S_SUB_U64_PSEUDO: {
6054 return Expand64BitScalarArithmetic(MI, BB);
6055 }
6056 case AMDGPU::V_ADD_U64_PSEUDO:
6057 case AMDGPU::V_SUB_U64_PSEUDO: {
6058 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6059
6060 MachineOperand &Dest = MI.getOperand(0);
6061 MachineOperand &Src0 = MI.getOperand(1);
6062 MachineOperand &Src1 = MI.getOperand(2);
6063
6064 if (ST.hasAddSubU64Insts()) {
6065 auto I = BuildMI(*BB, MI, DL,
6066 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6067 : AMDGPU::V_SUB_U64_e64),
6068 Dest.getReg())
6069 .add(Src0)
6070 .add(Src1)
6071 .addImm(0); // clamp
6072 TII->legalizeOperands(*I);
6073 MI.eraseFromParent();
6074 return BB;
6075 }
6076
6077 if (IsAdd && ST.hasLshlAddU64Inst()) {
6078 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6079 Dest.getReg())
6080 .add(Src0)
6081 .addImm(0)
6082 .add(Src1);
6083 TII->legalizeOperands(*Add);
6084 MI.eraseFromParent();
6085 return BB;
6086 }
6087
6088 const auto *CarryRC = TRI->getWaveMaskRegClass();
6089
6090 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6091 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6092
6093 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6094 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6095
6096 const TargetRegisterClass *Src0RC = Src0.isReg()
6097 ? MRI.getRegClass(Src0.getReg())
6098 : &AMDGPU::VReg_64RegClass;
6099 const TargetRegisterClass *Src1RC = Src1.isReg()
6100 ? MRI.getRegClass(Src1.getReg())
6101 : &AMDGPU::VReg_64RegClass;
6102
6103 const TargetRegisterClass *Src0SubRC =
6104 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6105 const TargetRegisterClass *Src1SubRC =
6106 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6107
6108 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6109 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6110 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6111 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6112
6113 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6114 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6115 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6116 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6117
6118 unsigned LoOpc =
6119 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6120 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6121 .addReg(CarryReg, RegState::Define)
6122 .add(SrcReg0Sub0)
6123 .add(SrcReg1Sub0)
6124 .addImm(0); // clamp bit
6125
6126 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6127 MachineInstr *HiHalf =
6128 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6129 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6130 .add(SrcReg0Sub1)
6131 .add(SrcReg1Sub1)
6132 .addReg(CarryReg, RegState::Kill)
6133 .addImm(0); // clamp bit
6134
6135 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6136 .addReg(DestSub0)
6137 .addImm(AMDGPU::sub0)
6138 .addReg(DestSub1)
6139 .addImm(AMDGPU::sub1);
6140 TII->legalizeOperands(*LoHalf);
6141 TII->legalizeOperands(*HiHalf);
6142 MI.eraseFromParent();
6143 return BB;
6144 }
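 // The generic fallback above mirrors the scalar expansion but on the VALU:
 //   %lo, %carry = V_ADD_CO_U32_e64 src0.lo, src1.lo
 //   %hi         = V_ADDC_U32_e64   src0.hi, src1.hi, %carry
 // (V_SUB_CO_U32 / V_SUBB_U32 for the subtract form), with the two halves
 // glued back together by the REG_SEQUENCE.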
6145 case AMDGPU::S_ADD_CO_PSEUDO:
6146 case AMDGPU::S_SUB_CO_PSEUDO: {
6147 // This pseudo can only be selected
6148 // from a uniform add/subcarry node, so all the VGPR operands
6149 // are assumed to be splat vectors.
6151 MachineOperand &Dest = MI.getOperand(0);
6152 MachineOperand &CarryDest = MI.getOperand(1);
6153 MachineOperand &Src0 = MI.getOperand(2);
6154 MachineOperand &Src1 = MI.getOperand(3);
6155 MachineOperand &Src2 = MI.getOperand(4);
6156 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6157 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6158 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6159 .addReg(Src0.getReg());
6160 Src0.setReg(RegOp0);
6161 }
6162 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6163 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6164 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6165 .addReg(Src1.getReg());
6166 Src1.setReg(RegOp1);
6167 }
6168 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6169 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6170 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6171 .addReg(Src2.getReg());
6172 Src2.setReg(RegOp2);
6173 }
6174
6175 if (ST.isWave64()) {
6176 if (ST.hasScalarCompareEq64()) {
6177 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6178 .addReg(Src2.getReg())
6179 .addImm(0);
6180 } else {
6181 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6182 const TargetRegisterClass *SubRC =
6183 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6184 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6185 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6186 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6187 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6188 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6189
6190 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6191 .add(Src2Sub0)
6192 .add(Src2Sub1);
6193
6194 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6195 .addReg(Src2_32, RegState::Kill)
6196 .addImm(0);
6197 }
6198 } else {
6199 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6200 .addReg(Src2.getReg())
6201 .addImm(0);
6202 }
6203
6204 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6205 ? AMDGPU::S_ADDC_U32
6206 : AMDGPU::S_SUBB_U32;
6207
6208 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6209
6210 unsigned SelOpc =
6211 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6212
6213 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6214 .addImm(-1)
6215 .addImm(0);
6216
6217 MI.eraseFromParent();
6218 return BB;
6219 }
6220 case AMDGPU::SI_INIT_M0: {
6221 MachineOperand &M0Init = MI.getOperand(0);
6222 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6223 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6224 AMDGPU::M0)
6225 .add(M0Init);
6226 MI.eraseFromParent();
6227 return BB;
6228 }
6229 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6230 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6231 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6232 TII->get(AMDGPU::S_CMP_EQ_U32))
6233 .addImm(0)
6234 .addImm(0);
6235 return BB;
6236 }
6237 case AMDGPU::GET_GROUPSTATICSIZE: {
6238 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6239 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6241 .add(MI.getOperand(0))
6242 .addImm(MFI->getLDSSize());
6243 MI.eraseFromParent();
6244 return BB;
6245 }
6246 case AMDGPU::GET_SHADERCYCLESHILO: {
6248 // The algorithm is:
6249 //
6250 // hi1 = getreg(SHADER_CYCLES_HI)
6251 // lo1 = getreg(SHADER_CYCLES_LO)
6252 // hi2 = getreg(SHADER_CYCLES_HI)
6253 //
6254 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6255 // Otherwise there was overflow and the result is hi2:0. In both cases the
6256 // result should represent the actual time at some point during the sequence
6257 // of three getregs.
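 // Worked example: suppose the counter advances from 0x00000001_FFFFFFFF to
 // 0x00000002_00000000 between the reads. Then hi1 = 1, lo1 = 0xFFFFFFFF and
 // hi2 = 2, so hi1 != hi2 and the code below yields hi2:0 = 0x00000002_00000000,
 // which indeed lies within the interval spanned by the three reads.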
6258 using namespace AMDGPU::Hwreg;
6259 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6260 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6261 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6262 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6263 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6264 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6265 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6266 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6267 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6268 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6269 .addReg(RegHi1)
6270 .addReg(RegHi2);
6271 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6272 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6273 .addReg(RegLo1)
6274 .addImm(0);
6275 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6276 .add(MI.getOperand(0))
6277 .addReg(RegLo)
6278 .addImm(AMDGPU::sub0)
6279 .addReg(RegHi2)
6280 .addImm(AMDGPU::sub1);
6281 MI.eraseFromParent();
6282 return BB;
6283 }
6284 case AMDGPU::SI_INDIRECT_SRC_V1:
6285 case AMDGPU::SI_INDIRECT_SRC_V2:
6286 case AMDGPU::SI_INDIRECT_SRC_V3:
6287 case AMDGPU::SI_INDIRECT_SRC_V4:
6288 case AMDGPU::SI_INDIRECT_SRC_V5:
6289 case AMDGPU::SI_INDIRECT_SRC_V6:
6290 case AMDGPU::SI_INDIRECT_SRC_V7:
6291 case AMDGPU::SI_INDIRECT_SRC_V8:
6292 case AMDGPU::SI_INDIRECT_SRC_V9:
6293 case AMDGPU::SI_INDIRECT_SRC_V10:
6294 case AMDGPU::SI_INDIRECT_SRC_V11:
6295 case AMDGPU::SI_INDIRECT_SRC_V12:
6296 case AMDGPU::SI_INDIRECT_SRC_V16:
6297 case AMDGPU::SI_INDIRECT_SRC_V32:
6298 return emitIndirectSrc(MI, *BB, *getSubtarget());
6299 case AMDGPU::SI_INDIRECT_DST_V1:
6300 case AMDGPU::SI_INDIRECT_DST_V2:
6301 case AMDGPU::SI_INDIRECT_DST_V3:
6302 case AMDGPU::SI_INDIRECT_DST_V4:
6303 case AMDGPU::SI_INDIRECT_DST_V5:
6304 case AMDGPU::SI_INDIRECT_DST_V6:
6305 case AMDGPU::SI_INDIRECT_DST_V7:
6306 case AMDGPU::SI_INDIRECT_DST_V8:
6307 case AMDGPU::SI_INDIRECT_DST_V9:
6308 case AMDGPU::SI_INDIRECT_DST_V10:
6309 case AMDGPU::SI_INDIRECT_DST_V11:
6310 case AMDGPU::SI_INDIRECT_DST_V12:
6311 case AMDGPU::SI_INDIRECT_DST_V16:
6312 case AMDGPU::SI_INDIRECT_DST_V32:
6313 return emitIndirectDst(MI, *BB, *getSubtarget());
6314 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6315 case AMDGPU::SI_KILL_I1_PSEUDO:
6316 return splitKillBlock(MI, BB);
6317 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6318 Register Dst = MI.getOperand(0).getReg();
6319 const MachineOperand &Src0 = MI.getOperand(1);
6320 const MachineOperand &Src1 = MI.getOperand(2);
6321 Register SrcCond = MI.getOperand(3).getReg();
6322
6323 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6324 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6325 const auto *CondRC = TRI->getWaveMaskRegClass();
6326 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6327
6328 const TargetRegisterClass *Src0RC = Src0.isReg()
6329 ? MRI.getRegClass(Src0.getReg())
6330 : &AMDGPU::VReg_64RegClass;
6331 const TargetRegisterClass *Src1RC = Src1.isReg()
6332 ? MRI.getRegClass(Src1.getReg())
6333 : &AMDGPU::VReg_64RegClass;
6334
6335 const TargetRegisterClass *Src0SubRC =
6336 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6337 const TargetRegisterClass *Src1SubRC =
6338 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6339
6340 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6341 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6342 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6343 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6344
6345 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6346 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6347 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6348 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6349
6350 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6351 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6352 .addImm(0)
6353 .add(Src0Sub0)
6354 .addImm(0)
6355 .add(Src1Sub0)
6356 .addReg(SrcCondCopy);
6357 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6358 .addImm(0)
6359 .add(Src0Sub1)
6360 .addImm(0)
6361 .add(Src1Sub1)
6362 .addReg(SrcCondCopy);
6363
6364 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6365 .addReg(DstLo)
6366 .addImm(AMDGPU::sub0)
6367 .addReg(DstHi)
6368 .addImm(AMDGPU::sub1);
6369 MI.eraseFromParent();
6370 return BB;
6371 }
6372 case AMDGPU::SI_BR_UNDEF: {
6373 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6374 .add(MI.getOperand(0));
6375 Br->getOperand(1).setIsUndef(); // read undef SCC
6376 MI.eraseFromParent();
6377 return BB;
6378 }
6379 case AMDGPU::ADJCALLSTACKUP:
6380 case AMDGPU::ADJCALLSTACKDOWN: {
6381 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6382 MachineInstrBuilder MIB(*MF, &MI);
6383 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6384 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6385 return BB;
6386 }
6387 case AMDGPU::SI_CALL_ISEL: {
6388 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6389
6390 MachineInstrBuilder MIB;
6391 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6392
6393 for (const MachineOperand &MO : MI.operands())
6394 MIB.add(MO);
6395
6396 MIB.cloneMemRefs(MI);
6397 MI.eraseFromParent();
6398 return BB;
6399 }
6400 case AMDGPU::V_ADD_CO_U32_e32:
6401 case AMDGPU::V_SUB_CO_U32_e32:
6402 case AMDGPU::V_SUBREV_CO_U32_e32: {
6403 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6404 unsigned Opc = MI.getOpcode();
6405
6406 bool NeedClampOperand = false;
6407 if (TII->pseudoToMCOpcode(Opc) == -1) {
6409 NeedClampOperand = true;
6410 }
6411
6412 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6413 if (TII->isVOP3(*I)) {
6414 I.addReg(TRI->getVCC(), RegState::Define);
6415 }
6416 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6417 if (NeedClampOperand)
6418 I.addImm(0); // clamp bit for e64 encoding
6419
6420 TII->legalizeOperands(*I);
6421
6422 MI.eraseFromParent();
6423 return BB;
6424 }
6425 case AMDGPU::V_ADDC_U32_e32:
6426 case AMDGPU::V_SUBB_U32_e32:
6427 case AMDGPU::V_SUBBREV_U32_e32:
6428 // These instructions have an implicit use of vcc which counts towards the
6429 // constant bus limit.
6430 TII->legalizeOperands(MI);
6431 return BB;
6432 case AMDGPU::DS_GWS_INIT:
6433 case AMDGPU::DS_GWS_SEMA_BR:
6434 case AMDGPU::DS_GWS_BARRIER:
6435 case AMDGPU::DS_GWS_SEMA_V:
6436 case AMDGPU::DS_GWS_SEMA_P:
6437 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6438 // An s_waitcnt 0 is required to be the instruction immediately following.
6439 if (getSubtarget()->hasGWSAutoReplay()) {
6441 return BB;
6442 }
6443
6444 return emitGWSMemViolTestLoop(MI, BB);
6445 case AMDGPU::S_SETREG_B32: {
6446 // Try to optimize cases that only set the denormal mode or rounding mode.
6447 //
6448 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6449 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6450 // instead.
6451 //
6452 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6453 // allow you to have a no-side-effect instruction in the output of a
6454 // side-effecting pattern.
6455 auto [ID, Offset, Width] =
6456 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6457 if (ID != AMDGPU::Hwreg::ID_MODE)
6458 return BB;
6459
6460 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6461 const unsigned SetMask = WidthMask << Offset;
6462
6463 if (getSubtarget()->hasDenormModeInst()) {
6464 unsigned SetDenormOp = 0;
6465 unsigned SetRoundOp = 0;
6466
6467 // The dedicated instructions can only set the whole denorm or round mode
6468 // at once, not a subset of bits in either.
6469 if (SetMask ==
6470 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6471 // If this fully sets both the round and denorm mode, emit the two
6472 // dedicated instructions for these.
6473 SetRoundOp = AMDGPU::S_ROUND_MODE;
6474 SetDenormOp = AMDGPU::S_DENORM_MODE;
6475 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6476 SetRoundOp = AMDGPU::S_ROUND_MODE;
6477 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6478 SetDenormOp = AMDGPU::S_DENORM_MODE;
6479 }
6480
6481 if (SetRoundOp || SetDenormOp) {
6482 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6483 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6484 unsigned ImmVal = Def->getOperand(1).getImm();
6485 if (SetRoundOp) {
6486 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6487 .addImm(ImmVal & 0xf);
6488
6489 // If we also have the denorm mode, get just the denorm mode bits.
6490 ImmVal >>= 4;
6491 }
6492
6493 if (SetDenormOp) {
6494 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6495 .addImm(ImmVal & 0xf);
6496 }
6497
6498 MI.eraseFromParent();
6499 return BB;
6500 }
6501 }
6502 }
6503
6504 // If only FP bits are touched, use the no-side-effects pseudo.
6505 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6506 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6507 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6508
6509 return BB;
6510 }
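 // Example of the rewrite above: an s_setreg_b32 that stores a constant to
 // exactly the four rounding-mode bits of the MODE register becomes a single
 // S_ROUND_MODE taking the low four bits of that constant; if the write also
 // covers the denorm bits, an S_DENORM_MODE with the next four bits is emitted
 // as well, and the original s_setreg_b32 is erased.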
6511 case AMDGPU::S_INVERSE_BALLOT_U32:
6512 case AMDGPU::S_INVERSE_BALLOT_U64:
6513 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6514 // necessary. After that they are equivalent to a COPY.
6515 MI.setDesc(TII->get(AMDGPU::COPY));
6516 return BB;
6517 case AMDGPU::ENDPGM_TRAP: {
6518 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6519 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6520 MI.addOperand(MachineOperand::CreateImm(0));
6521 return BB;
6522 }
6523
6524 // We need a block split to make the real endpgm a terminator. We also don't
6525 // want to break phis in successor blocks, so we can't just delete to the
6526 // end of the block.
6527
6528 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6529 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6530 MF->push_back(TrapBB);
6531 // clang-format off
6532 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6533 .addImm(0);
6534 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6535 .addMBB(TrapBB);
6536 // clang-format on
6537
6538 BB->addSuccessor(TrapBB);
6539 MI.eraseFromParent();
6540 return SplitBB;
6541 }
6542 case AMDGPU::SIMULATED_TRAP: {
6543 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6544 MachineBasicBlock *SplitBB =
6545 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6546 MI.eraseFromParent();
6547 return SplitBB;
6548 }
6549 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6550 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6552
6553 // During ISel, it's difficult to propagate the original EXEC mask to use as
6554 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6555 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6556 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6557 Register OriginalExec = Setup->getOperand(0).getReg();
6558 MF->getRegInfo().clearKillFlags(OriginalExec);
6559 MI.getOperand(0).setReg(OriginalExec);
6560 return BB;
6561 }
6562 default:
6563 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6564 if (!MI.mayStore())
6566 return BB;
6567 }
6568 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6569 }
6570}
6571
6573 // This currently forces unfolding various combinations of fsub into fma with
6574 // free fneg'd operands. As long as we have fast FMA (controlled by
6575 // isFMAFasterThanFMulAndFAdd), we should perform these.
6576
6577 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6578 // most of these combines appear to be cycle neutral but save on instruction
6579 // count / code size.
6580 return true;
6581}
6582
6584
6586 EVT VT) const {
6587 if (!VT.isVector()) {
6588 return MVT::i1;
6589 }
6590 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6591}
6592
6594 // TODO: Should i16 be used always if legal? For now it would force VALU
6595 // shifts.
6596 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6597}
6598
6600 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6601 ? Ty.changeElementSize(16)
6602 : Ty.changeElementSize(32);
6603}
6604
6605// Answering this is somewhat tricky and depends on the specific device, since
6606// devices have different rates for fma and for f64 operations in general.
6607//
6608// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6609// regardless of which device (although the number of cycles differs between
6610// devices), so it is always profitable for f64.
6611//
6612// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6613// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6614// which we can always do even without fused FP ops since it returns the same
6615// result as the separate operations and since it is always full
6616// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6617// however does not support denormals, so we do report fma as faster if we have
6618// a fast fma device and require denormals.
6619//
6620bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6621 EVT VT) const {
6622 VT = VT.getScalarType();
6623
6624 switch (VT.getSimpleVT().SimpleTy) {
6625 case MVT::f32: {
6626 // If mad is not available this depends only on if f32 fma is full rate.
6627 if (!Subtarget->hasMadMacF32Insts())
6628 return Subtarget->hasFastFMAF32();
6629
6630 // Otherwise f32 mad is always full rate and returns the same result as
6631 // the separate operations, so it should be preferred over fma.
6632 // However, it does not support denormals.
6634 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6635
6636 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6637 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6638 }
6639 case MVT::f64:
6640 return true;
6641 case MVT::f16:
6642 case MVT::bf16:
6643 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6644 default:
6645 break;
6646 }
6647
6648 return false;
6649}
6650
6651bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6652 LLT Ty) const {
6653 switch (Ty.getScalarSizeInBits()) {
6654 case 16:
6655 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6656 case 32:
6657 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6658 case 64:
6659 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6660 default:
6661 break;
6662 }
6663
6664 return false;
6665}
6666
6668 if (!Ty.isScalar())
6669 return false;
6670
6671 if (Ty.getScalarSizeInBits() == 16)
6672 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6673 if (Ty.getScalarSizeInBits() == 32)
6674 return Subtarget->hasMadMacF32Insts() &&
6675 denormalModeIsFlushAllF32(*MI.getMF());
6676
6677 return false;
6678}
6679
6681 const SDNode *N) const {
6682 // TODO: Check future ftz flag
6683 // v_mad_f32/v_mac_f32 do not support denormals.
6684 EVT VT = N->getValueType(0);
6685 if (VT == MVT::f32)
6686 return Subtarget->hasMadMacF32Insts() &&
6688 if (VT == MVT::f16) {
6689 return Subtarget->hasMadF16() &&
6691 }
6692
6693 return false;
6694}
6695
6696//===----------------------------------------------------------------------===//
6697// Custom DAG Lowering Operations
6698//===----------------------------------------------------------------------===//
6699
6700// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6701// wider vector type is legal.
6702SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6703 SelectionDAG &DAG) const {
6704 unsigned Opc = Op.getOpcode();
6705 EVT VT = Op.getValueType();
6706 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6707 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6708 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6709 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6710 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6711 VT == MVT::v32bf16);
6712
6713 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6714
6715 SDLoc SL(Op);
6716 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6717 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6718
6719 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6720}
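// For example, a v8f16 fneg reaching this point is lowered as two v4f16 fnegs
// whose results are recombined with CONCAT_VECTORS, instead of letting
// LegalizeDAG scalarize it into eight f16 operations.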
6721
6722// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6723// regression whereby extra unnecessary instructions were added to codegen
6724 // for rotr operations, caused by legalising v2i32 or. This resulted in extra
6725// instructions to extract the result from the vector.
6727 [[maybe_unused]] EVT VT = Op.getValueType();
6728
6729 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6730 VT == MVT::v16i32) &&
6731 "Unexpected ValueType.");
6732
6733 return DAG.UnrollVectorOp(Op.getNode());
6734}
6735
6736// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6737// wider vector type is legal.
6739 SelectionDAG &DAG) const {
6740 unsigned Opc = Op.getOpcode();
6741 EVT VT = Op.getValueType();
6742 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6743 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6744 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6745 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6746 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6747 VT == MVT::v32bf16);
6748
6749 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6750 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6751
6752 SDLoc SL(Op);
6753
6754 SDValue OpLo =
6755 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6756 SDValue OpHi =
6757 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6758
6759 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6760}
6761
6762SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6763                                               SelectionDAG &DAG) const {
6764 unsigned Opc = Op.getOpcode();
6765 EVT VT = Op.getValueType();
6766 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6767 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6768 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6769 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6770 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6771 VT == MVT::v32bf16);
6772
6773 SDValue Op0 = Op.getOperand(0);
6774 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6775 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6776 : std::pair(Op0, Op0);
6777
6778 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6779 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6780
6781 SDLoc SL(Op);
6782 auto ResVT = DAG.GetSplitDestVTs(VT);
6783
6784 SDValue OpLo =
6785 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6786 SDValue OpHi =
6787 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6788
6789 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6790}
6791
6792SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6793  switch (Op.getOpcode()) {
6794  default:
6795    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6796  case ISD::BRCOND:
6797 return LowerBRCOND(Op, DAG);
6798 case ISD::RETURNADDR:
6799 return LowerRETURNADDR(Op, DAG);
6800 case ISD::LOAD: {
6801 SDValue Result = LowerLOAD(Op, DAG);
6802 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6803 "Load should return a value and a chain");
6804 return Result;
6805 }
6806 case ISD::FSQRT: {
6807 EVT VT = Op.getValueType();
6808 if (VT == MVT::f32)
6809 return lowerFSQRTF32(Op, DAG);
6810 if (VT == MVT::f64)
6811 return lowerFSQRTF64(Op, DAG);
6812 return SDValue();
6813 }
6814 case ISD::FSIN:
6815 case ISD::FCOS:
6816 return LowerTrig(Op, DAG);
6817 case ISD::SELECT:
6818 return LowerSELECT(Op, DAG);
6819 case ISD::FDIV:
6820 return LowerFDIV(Op, DAG);
6821 case ISD::FFREXP:
6822 return LowerFFREXP(Op, DAG);
6823  case ISD::ATOMIC_CMP_SWAP:
6824    return LowerATOMIC_CMP_SWAP(Op, DAG);
6825 case ISD::STORE:
6826 return LowerSTORE(Op, DAG);
6827 case ISD::GlobalAddress: {
6828    MachineFunction &MF = DAG.getMachineFunction();
6829    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6830    return LowerGlobalAddress(MFI, Op, DAG);
6831 }
6832  case ISD::ExternalSymbol:
6833    return LowerExternalSymbol(Op, DAG);
6834  case ISD::INTRINSIC_WO_CHAIN:
6835    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6836  case ISD::INTRINSIC_W_CHAIN:
6837    return LowerINTRINSIC_W_CHAIN(Op, DAG);
6838  case ISD::INTRINSIC_VOID:
6839    return LowerINTRINSIC_VOID(Op, DAG);
6840  case ISD::ADDRSPACECAST:
6841    return lowerADDRSPACECAST(Op, DAG);
6842  case ISD::INSERT_SUBVECTOR:
6843    return lowerINSERT_SUBVECTOR(Op, DAG);
6844  case ISD::INSERT_VECTOR_ELT:
6845    return lowerINSERT_VECTOR_ELT(Op, DAG);
6846  case ISD::EXTRACT_VECTOR_ELT:
6847    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6848  case ISD::VECTOR_SHUFFLE:
6849    return lowerVECTOR_SHUFFLE(Op, DAG);
6850  case ISD::SCALAR_TO_VECTOR:
6851    return lowerSCALAR_TO_VECTOR(Op, DAG);
6852 case ISD::BUILD_VECTOR:
6853 return lowerBUILD_VECTOR(Op, DAG);
6854 case ISD::FP_ROUND:
6855  case ISD::STRICT_FP_ROUND:
6856    return lowerFP_ROUND(Op, DAG);
6857 case ISD::TRAP:
6858 return lowerTRAP(Op, DAG);
6859 case ISD::DEBUGTRAP:
6860 return lowerDEBUGTRAP(Op, DAG);
6861 case ISD::ABS:
6862 case ISD::FABS:
6863 case ISD::FNEG:
6864 case ISD::FCANONICALIZE:
6865 case ISD::BSWAP:
6866 return splitUnaryVectorOp(Op, DAG);
6867 case ISD::FMINNUM:
6868 case ISD::FMAXNUM:
6869 return lowerFMINNUM_FMAXNUM(Op, DAG);
6870 case ISD::FMINIMUMNUM:
6871 case ISD::FMAXIMUMNUM:
6872 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6873 case ISD::FMINIMUM:
6874 case ISD::FMAXIMUM:
6875 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6876 case ISD::FLDEXP:
6877 case ISD::STRICT_FLDEXP:
6878 return lowerFLDEXP(Op, DAG);
6879 case ISD::FMA:
6880 return splitTernaryVectorOp(Op, DAG);
6881 case ISD::FP_TO_SINT:
6882 case ISD::FP_TO_UINT:
6883 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
6884 Op.getValueType() == MVT::i16 &&
6885 Op.getOperand(0).getValueType() == MVT::f32) {
6886 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
6887 return Op;
6888 }
6889 return LowerFP_TO_INT(Op, DAG);
6890 case ISD::SHL:
6891 case ISD::SRA:
6892 case ISD::SRL:
6893 case ISD::ADD:
6894 case ISD::SUB:
6895 case ISD::SMIN:
6896 case ISD::SMAX:
6897 case ISD::UMIN:
6898 case ISD::UMAX:
6899 case ISD::FADD:
6900 case ISD::FMUL:
6901 case ISD::FMINNUM_IEEE:
6902 case ISD::FMAXNUM_IEEE:
6903 case ISD::UADDSAT:
6904 case ISD::USUBSAT:
6905 case ISD::SADDSAT:
6906 case ISD::SSUBSAT:
6907 return splitBinaryVectorOp(Op, DAG);
6908 case ISD::FCOPYSIGN:
6909 return lowerFCOPYSIGN(Op, DAG);
6910 case ISD::MUL:
6911 return lowerMUL(Op, DAG);
6912 case ISD::SMULO:
6913 case ISD::UMULO:
6914 return lowerXMULO(Op, DAG);
6915 case ISD::SMUL_LOHI:
6916 case ISD::UMUL_LOHI:
6917 return lowerXMUL_LOHI(Op, DAG);
6918  case ISD::DYNAMIC_STACKALLOC:
6919    return LowerDYNAMIC_STACKALLOC(Op, DAG);
6920 case ISD::STACKSAVE:
6921 return LowerSTACKSAVE(Op, DAG);
6922 case ISD::GET_ROUNDING:
6923 return lowerGET_ROUNDING(Op, DAG);
6924 case ISD::SET_ROUNDING:
6925 return lowerSET_ROUNDING(Op, DAG);
6926 case ISD::PREFETCH:
6927 return lowerPREFETCH(Op, DAG);
6928 case ISD::FP_EXTEND:
6929  case ISD::STRICT_FP_EXTEND:
6930    return lowerFP_EXTEND(Op, DAG);
6931 case ISD::GET_FPENV:
6932 return lowerGET_FPENV(Op, DAG);
6933 case ISD::SET_FPENV:
6934 return lowerSET_FPENV(Op, DAG);
6935 case ISD::ROTR:
6936 return lowerROTR(Op, DAG);
6937 }
6938 return SDValue();
6939}
6940
6941// Used for D16: Casts the result of an instruction into the right vector,
6942// packs values if loads return unpacked values.
6943static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6944                                       const SDLoc &DL, SelectionDAG &DAG,
6945 bool Unpacked) {
6946 if (!LoadVT.isVector())
6947 return Result;
6948
6949 // Cast back to the original packed type or to a larger type that is a
6950  // multiple of 32 bits for D16. Widening the return type is required for
6951 // legalization.
6952 EVT FittingLoadVT = LoadVT;
6953 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6954    FittingLoadVT =
6955        EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6956 LoadVT.getVectorNumElements() + 1);
6957 }
6958
6959 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6960 // Truncate to v2i16/v4i16.
6961 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6962
6963 // Workaround legalizer not scalarizing truncate after vector op
6964 // legalization but not creating intermediate vector trunc.
6965    SmallVector<SDValue, 4> Elts;
6966    DAG.ExtractVectorElements(Result, Elts);
6967 for (SDValue &Elt : Elts)
6968 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6969
6970    // Pad illegal v1i16/v3f16 to v4i16
6971 if ((LoadVT.getVectorNumElements() % 2) == 1)
6972 Elts.push_back(DAG.getPOISON(MVT::i16));
6973
6974 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6975
6976 // Bitcast to original type (v2f16/v4f16).
6977 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6978 }
6979
6980 // Cast back to the original packed type.
6981 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6982}
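// Illustrative sketch (not part of the original source): with unpacked D16
// memory instructions a v3f16 load is returned as v3i32. This helper extracts
// the three i32 lanes, truncates each one to i16, pads with a poison element
// to reach the legal v4i16, and bitcasts the result back to v4f16.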
6983
6984SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6985 SelectionDAG &DAG,
6986                                              ArrayRef<SDValue> Ops,
6987                                              bool IsIntrinsic) const {
6988 SDLoc DL(M);
6989
6990 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6991 EVT LoadVT = M->getValueType(0);
6992
6993 EVT EquivLoadVT = LoadVT;
6994 if (LoadVT.isVector()) {
6995 if (Unpacked) {
6996 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6997 LoadVT.getVectorNumElements());
6998 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6999 // Widen v3f16 to legal type
7000      EquivLoadVT =
7001          EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7002 LoadVT.getVectorNumElements() + 1);
7003 }
7004 }
7005
7006 // Change from v4f16/v2f16 to EquivLoadVT.
7007 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7008
7009  SDValue Load = DAG.getMemIntrinsicNode(
7010      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7011 M->getMemoryVT(), M->getMemOperand());
7012
7013 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7014
7015 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7016}
7017
7018SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7019 SelectionDAG &DAG,
7020 ArrayRef<SDValue> Ops) const {
7021 SDLoc DL(M);
7022 EVT LoadVT = M->getValueType(0);
7023 EVT EltType = LoadVT.getScalarType();
7024 EVT IntVT = LoadVT.changeTypeToInteger();
7025
7026 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7027
7028 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7029 bool IsTFE = M->getNumValues() == 3;
7030
7031 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7032 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7033 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7034 : AMDGPUISD::BUFFER_LOAD;
7035
7036 if (IsD16) {
7037 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7038 }
7039
7040 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7041 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7042 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7043 IsTFE);
7044
7045 if (isTypeLegal(LoadVT)) {
7046 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7047 M->getMemOperand(), DAG);
7048 }
7049
7050 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7051 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7052 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7053 M->getMemOperand(), DAG);
7054 return DAG.getMergeValues(
7055 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7056 DL);
7057}
7058
7059static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7060                                  SelectionDAG &DAG) {
7061 EVT VT = N->getValueType(0);
7062 unsigned CondCode = N->getConstantOperandVal(3);
7063 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7064 return DAG.getPOISON(VT);
7065
7066 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7067
7068 SDValue LHS = N->getOperand(1);
7069 SDValue RHS = N->getOperand(2);
7070
7071 SDLoc DL(N);
7072
7073 EVT CmpVT = LHS.getValueType();
7074 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7075    unsigned PromoteOp =
7076        ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7077 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7078 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7079 }
7080
7081 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7082
7083 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7084 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7085
7086 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7087 DAG.getCondCode(CCOpcode));
7088 if (VT.bitsEq(CCVT))
7089 return SetCC;
7090 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7091}
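// Illustrative sketch (not part of the original source): on a wave64 target
//   llvm.amdgcn.icmp.i64.i32(%a, %b, 32 /*eq*/)
// becomes (AMDGPUISD::SETCC i64 %a, %b, seteq), i.e. a compare producing the
// full 64-bit lane mask; illegal i16 operands are first promoted to i32 using
// the extension implied by the predicate (sign for signed, zero otherwise).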
7092
7093static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7094                                  SelectionDAG &DAG) {
7095 EVT VT = N->getValueType(0);
7096
7097 unsigned CondCode = N->getConstantOperandVal(3);
7098 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7099 return DAG.getPOISON(VT);
7100
7101 SDValue Src0 = N->getOperand(1);
7102 SDValue Src1 = N->getOperand(2);
7103 EVT CmpVT = Src0.getValueType();
7104 SDLoc SL(N);
7105
7106 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7107 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7108 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7109 }
7110
7111 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7112 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7113 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7114 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7115 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7116 DAG.getCondCode(CCOpcode));
7117 if (VT.bitsEq(CCVT))
7118 return SetCC;
7119 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7120}
7121
7122static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7123                                    SelectionDAG &DAG) {
7124 EVT VT = N->getValueType(0);
7125 SDValue Src = N->getOperand(1);
7126 SDLoc SL(N);
7127
7128 if (Src.getOpcode() == ISD::SETCC) {
7129 SDValue Op0 = Src.getOperand(0);
7130 SDValue Op1 = Src.getOperand(1);
7131 // Need to expand bfloat to float for comparison (setcc).
7132 if (Op0.getValueType() == MVT::bf16) {
7133 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7134 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7135 }
7136 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7137 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7138 }
7139 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7140 // (ballot 0) -> 0
7141 if (Arg->isZero())
7142 return DAG.getConstant(0, SL, VT);
7143
7144 // (ballot 1) -> EXEC/EXEC_LO
7145 if (Arg->isOne()) {
7146 Register Exec;
7147 if (VT.getScalarSizeInBits() == 32)
7148 Exec = AMDGPU::EXEC_LO;
7149 else if (VT.getScalarSizeInBits() == 64)
7150 Exec = AMDGPU::EXEC;
7151 else
7152 return SDValue();
7153
7154 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7155 }
7156 }
7157
7158 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7159 // ISD::SETNE)
7160 return DAG.getNode(
7161 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7162 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7163}
7164
7165static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7166                           SelectionDAG &DAG) {
7167 EVT VT = N->getValueType(0);
7168 unsigned ValSize = VT.getSizeInBits();
7169 unsigned IID = N->getConstantOperandVal(0);
7170 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7171 IID == Intrinsic::amdgcn_permlanex16;
7172 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7173 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7174 SDLoc SL(N);
7175 MVT IntVT = MVT::getIntegerVT(ValSize);
7176 const GCNSubtarget *ST = TLI.getSubtarget();
7177 unsigned SplitSize = 32;
7178 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7179 ST->hasDPALU_DPP() &&
7180 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7181 SplitSize = 64;
7182
7183 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7184 SDValue Src2, MVT ValT) -> SDValue {
7185 SmallVector<SDValue, 8> Operands;
7186 switch (IID) {
7187 case Intrinsic::amdgcn_permlane16:
7188 case Intrinsic::amdgcn_permlanex16:
7189 case Intrinsic::amdgcn_update_dpp:
7190 Operands.push_back(N->getOperand(6));
7191 Operands.push_back(N->getOperand(5));
7192 Operands.push_back(N->getOperand(4));
7193 [[fallthrough]];
7194 case Intrinsic::amdgcn_writelane:
7195 Operands.push_back(Src2);
7196 [[fallthrough]];
7197 case Intrinsic::amdgcn_readlane:
7198 case Intrinsic::amdgcn_set_inactive:
7199 case Intrinsic::amdgcn_set_inactive_chain_arg:
7200 case Intrinsic::amdgcn_mov_dpp8:
7201 Operands.push_back(Src1);
7202 [[fallthrough]];
7203 case Intrinsic::amdgcn_readfirstlane:
7204 case Intrinsic::amdgcn_permlane64:
7205 Operands.push_back(Src0);
7206 break;
7207 default:
7208 llvm_unreachable("unhandled lane op");
7209 }
7210
7211 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7212 std::reverse(Operands.begin(), Operands.end());
7213
7214 if (SDNode *GL = N->getGluedNode()) {
7215 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7216 GL = GL->getOperand(0).getNode();
7217 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7218 SDValue(GL, 0)));
7219 }
7220
7221 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7222 };
7223
7224 SDValue Src0 = N->getOperand(1);
7225 SDValue Src1, Src2;
7226 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7227 IID == Intrinsic::amdgcn_mov_dpp8 ||
7228 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7229 Src1 = N->getOperand(2);
7230 if (IID == Intrinsic::amdgcn_writelane ||
7231 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7232 Src2 = N->getOperand(3);
7233 }
7234
7235 if (ValSize == SplitSize) {
7236 // Already legal
7237 return SDValue();
7238 }
7239
7240 if (ValSize < 32) {
7241 bool IsFloat = VT.isFloatingPoint();
7242 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7243 SL, MVT::i32);
7244
7245 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7246 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7247 SL, MVT::i32);
7248 }
7249
7250 if (IID == Intrinsic::amdgcn_writelane) {
7251 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7252 SL, MVT::i32);
7253 }
7254
7255 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7256 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7257 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7258 }
7259
7260 if (ValSize % SplitSize != 0)
7261 return SDValue();
7262
7263 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7264 EVT VT = N->getValueType(0);
7265 unsigned NE = VT.getVectorNumElements();
7266 EVT EltVT = VT.getVectorElementType();
7267    SmallVector<SDValue, 8> Scalars;
7268    unsigned NumOperands = N->getNumOperands();
7269 SmallVector<SDValue, 4> Operands(NumOperands);
7270 SDNode *GL = N->getGluedNode();
7271
7272 // only handle convergencectrl_glue
7273    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7274
7275 for (unsigned i = 0; i != NE; ++i) {
7276 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7277 ++j) {
7278 SDValue Operand = N->getOperand(j);
7279 EVT OperandVT = Operand.getValueType();
7280 if (OperandVT.isVector()) {
7281 // A vector operand; extract a single element.
7282 EVT OperandEltVT = OperandVT.getVectorElementType();
7283 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7284 Operand, DAG.getVectorIdxConstant(i, SL));
7285 } else {
7286 // A scalar operand; just use it as is.
7287 Operands[j] = Operand;
7288 }
7289 }
7290
7291 if (GL)
7292 Operands[NumOperands - 1] =
7293 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7294 SDValue(GL->getOperand(0).getNode(), 0));
7295
7296 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7297 }
7298
7299 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7300 return DAG.getBuildVector(VecVT, SL, Scalars);
7301 };
7302
7303 if (VT.isVector()) {
7304 switch (MVT::SimpleValueType EltTy =
7305                VT.getVectorElementType().getSimpleVT().SimpleTy) {
7306    case MVT::i32:
7307 case MVT::f32:
7308 if (SplitSize == 32) {
7309 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7310 return unrollLaneOp(LaneOp.getNode());
7311 }
7312 [[fallthrough]];
7313 case MVT::i16:
7314 case MVT::f16:
7315 case MVT::bf16: {
7316 unsigned SubVecNumElt =
7317 SplitSize / VT.getVectorElementType().getSizeInBits();
7318 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7319      SmallVector<SDValue, 4> Pieces;
7320      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7321 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7322 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7323 DAG.getConstant(EltIdx, SL, MVT::i32));
7324
7325 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7326 IsPermLane16)
7327 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7328 DAG.getConstant(EltIdx, SL, MVT::i32));
7329
7330 if (IID == Intrinsic::amdgcn_writelane)
7331 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7332 DAG.getConstant(EltIdx, SL, MVT::i32));
7333
7334 Pieces.push_back(
7335 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7336 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7337 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7338 EltIdx += SubVecNumElt;
7339 }
7340 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7341 }
7342 default:
7343 // Handle all other cases by bitcasting to i32 vectors
7344 break;
7345 }
7346 }
7347
7348 MVT VecVT =
7349 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7350 Src0 = DAG.getBitcast(VecVT, Src0);
7351
7352 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7353 Src1 = DAG.getBitcast(VecVT, Src1);
7354
7355 if (IID == Intrinsic::amdgcn_writelane)
7356 Src2 = DAG.getBitcast(VecVT, Src2);
7357
7358 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7359 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7360 return DAG.getBitcast(VT, UnrolledLaneOp);
7361}
7362
7364 SelectionDAG &DAG) {
7365 EVT VT = N->getValueType(0);
7366
7367 if (VT.getSizeInBits() != 32)
7368 return SDValue();
7369
7370 SDLoc SL(N);
7371
7372 SDValue Value = N->getOperand(1);
7373 SDValue Index = N->getOperand(2);
7374
7375 // ds_bpermute requires index to be multiplied by 4
7376 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
7377 SDValue ShiftedIndex =
7378 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
7379
7380  // The intrinsics below require i32 values to operate on.
7381 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
7382
7383 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7384 SmallVector<SDValue> IntrinArgs) -> SDValue {
7385 SmallVector<SDValue> Operands(1);
7386 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
7387 Operands.append(IntrinArgs);
7388 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
7389 };
7390
7391 // If we can bpermute across the whole wave, then just do that
7393 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7394 {ShiftedIndex, ValueI32});
7395 return DAG.getBitcast(VT, BPermute);
7396 }
7397
7398 assert(TLI.getSubtarget()->isWave64());
7399
7400 // Otherwise, we need to make use of whole wave mode
7401 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
7402
7403 // Set inactive lanes to poison
7404 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7405 {ValueI32, PoisonVal});
7406 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7407 {ShiftedIndex, PoisonVal});
7408
7409 SDValue Swapped =
7410 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7411
7412 // Get permutation of each half, then we'll select which one to use
7413 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7414 {WWMIndex, WWMValue});
7415 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7416 MVT::i32, {WWMIndex, Swapped});
7417 SDValue BPermOtherHalfWWM =
7418 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7419
7420 // Select which side to take the permute from
7421 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
7422 // We can get away with only using mbcnt_lo here since we're only
7423 // trying to detect which side of 32 each lane is on, and mbcnt_lo
7424 // returns 32 for lanes 32-63.
7425 SDValue ThreadID =
7426 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7427 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7428
7429 SDValue SameOrOtherHalf =
7430 DAG.getNode(ISD::AND, SL, MVT::i32,
7431 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7432 DAG.getTargetConstant(32, SL, MVT::i32));
7433 SDValue UseSameHalf =
7434 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7435 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7436 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7437 BPermOtherHalfWWM);
7438 return DAG.getBitcast(VT, Result);
7439}
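// Illustrative sketch (not part of the original source) of the wave64 path
// above: ds_bpermute only reads within a 32-lane half, so the value is
// mirrored across halves with permlane64, both candidates are computed in
// whole wave mode, and ((mbcnt_lo ^ index) & 32) == 0 decides whether the
// requested lane lives in the reader's own half or in the other one.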
7440
7441void SITargetLowering::ReplaceNodeResults(SDNode *N,
7442                                          SmallVectorImpl<SDValue> &Results,
7443                                          SelectionDAG &DAG) const {
7444 switch (N->getOpcode()) {
7445  case ISD::INSERT_VECTOR_ELT: {
7446    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7447 Results.push_back(Res);
7448 return;
7449 }
7450  case ISD::EXTRACT_VECTOR_ELT: {
7451    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7452 Results.push_back(Res);
7453 return;
7454 }
7455  case ISD::INTRINSIC_WO_CHAIN: {
7456    unsigned IID = N->getConstantOperandVal(0);
7457 switch (IID) {
7458 case Intrinsic::amdgcn_make_buffer_rsrc:
7459 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7460 return;
7461 case Intrinsic::amdgcn_cvt_pkrtz: {
7462 SDValue Src0 = N->getOperand(1);
7463 SDValue Src1 = N->getOperand(2);
7464 SDLoc SL(N);
7465 SDValue Cvt =
7466 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7467 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7468 return;
7469 }
7470 case Intrinsic::amdgcn_cvt_pknorm_i16:
7471 case Intrinsic::amdgcn_cvt_pknorm_u16:
7472 case Intrinsic::amdgcn_cvt_pk_i16:
7473 case Intrinsic::amdgcn_cvt_pk_u16: {
7474 SDValue Src0 = N->getOperand(1);
7475 SDValue Src1 = N->getOperand(2);
7476 SDLoc SL(N);
7477 unsigned Opcode;
7478
7479 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7480 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7481 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7482 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7483 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7484 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7485 else
7486 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7487
7488 EVT VT = N->getValueType(0);
7489 if (isTypeLegal(VT))
7490 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7491 else {
7492 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7493 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7494 }
7495 return;
7496 }
7497 case Intrinsic::amdgcn_s_buffer_load: {
7498 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7499 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7500 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7501 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7502 // s_buffer_load_i8.
7503 if (!Subtarget->hasScalarSubwordLoads())
7504 return;
7505 SDValue Op = SDValue(N, 0);
7506 SDValue Rsrc = Op.getOperand(1);
7507 SDValue Offset = Op.getOperand(2);
7508 SDValue CachePolicy = Op.getOperand(3);
7509 EVT VT = Op.getValueType();
7510 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7511 SDLoc DL(Op);
7512      MachineFunction &MF = DAG.getMachineFunction();
7513      const DataLayout &DataLayout = DAG.getDataLayout();
7514      Align Alignment =
7515          DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7516      MachineMemOperand *MMO = MF.getMachineMemOperand(
7517          MachinePointerInfo(),
7518          MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7519              MachineMemOperand::MOInvariant,
7520          VT.getStoreSize(), Alignment);
7521 SDValue LoadVal;
7522 if (!Offset->isDivergent()) {
7523 SDValue Ops[] = {Rsrc, // source register
7524 Offset, CachePolicy};
7525 SDValue BufferLoad =
7526 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7527 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7528 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7529 } else {
7530 SDValue Ops[] = {
7531 DAG.getEntryNode(), // Chain
7532 Rsrc, // rsrc
7533 DAG.getConstant(0, DL, MVT::i32), // vindex
7534 {}, // voffset
7535 {}, // soffset
7536 {}, // offset
7537 CachePolicy, // cachepolicy
7538 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7539 };
7540 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7541 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7542 }
7543 Results.push_back(LoadVal);
7544 return;
7545 }
7546 case Intrinsic::amdgcn_dead: {
7547 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7548 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7549 return;
7550 }
7551 }
7552 break;
7553 }
7554  case ISD::INTRINSIC_W_CHAIN: {
7555    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7556 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7557 // FIXME: Hacky
7558 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7559 Results.push_back(Res.getOperand(I));
7560 }
7561 } else {
7562 Results.push_back(Res);
7563 Results.push_back(Res.getValue(1));
7564 }
7565 return;
7566 }
7567
7568 break;
7569 }
7570 case ISD::SELECT: {
7571 SDLoc SL(N);
7572 EVT VT = N->getValueType(0);
7573 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7574 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7575 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7576
7577 EVT SelectVT = NewVT;
7578 if (NewVT.bitsLT(MVT::i32)) {
7579 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7580 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7581 SelectVT = MVT::i32;
7582 }
7583
7584 SDValue NewSelect =
7585 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7586
7587 if (NewVT != SelectVT)
7588 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7589 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7590 return;
7591 }
7592 case ISD::FNEG: {
7593 if (N->getValueType(0) != MVT::v2f16)
7594 break;
7595
7596 SDLoc SL(N);
7597 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7598
7599 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7600 DAG.getConstant(0x80008000, SL, MVT::i32));
7601 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7602 return;
7603 }
7604 case ISD::FABS: {
7605 if (N->getValueType(0) != MVT::v2f16)
7606 break;
7607
7608 SDLoc SL(N);
7609 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7610
7611 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7612 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7613 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7614 return;
7615 }
7616 case ISD::FSQRT: {
7617 if (N->getValueType(0) != MVT::f16)
7618 break;
7619 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7620 break;
7621 }
7622 default:
7623    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7624    break;
7625 }
7626}
7627
7628/// Helper function for LowerBRCOND
7629static SDNode *findUser(SDValue Value, unsigned Opcode) {
7630
7631 for (SDUse &U : Value->uses()) {
7632 if (U.get() != Value)
7633 continue;
7634
7635 if (U.getUser()->getOpcode() == Opcode)
7636 return U.getUser();
7637 }
7638 return nullptr;
7639}
7640
7641unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7642 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7643 switch (Intr->getConstantOperandVal(1)) {
7644 case Intrinsic::amdgcn_if:
7645 return AMDGPUISD::IF;
7646 case Intrinsic::amdgcn_else:
7647 return AMDGPUISD::ELSE;
7648 case Intrinsic::amdgcn_loop:
7649 return AMDGPUISD::LOOP;
7650 case Intrinsic::amdgcn_end_cf:
7651 llvm_unreachable("should not occur");
7652 default:
7653 return 0;
7654 }
7655 }
7656
7657 // break, if_break, else_break are all only used as inputs to loop, not
7658 // directly as branch conditions.
7659 return 0;
7660}
7661
7662bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7663  const Triple &TT = getTargetMachine().getTargetTriple();
7664  return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7665          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7666         AMDGPU::shouldEmitConstantsToTextSection(TT);
7667}
7668
7669bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7670 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7671 return false;
7672
7673 // FIXME: Either avoid relying on address space here or change the default
7674 // address space for functions to avoid the explicit check.
7675 return (GV->getValueType()->isFunctionTy() ||
7678}
7679
7680bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7681  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7682}
7683
7684bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7685  if (!GV->hasExternalLinkage())
7686 return true;
7687
7688 const auto OS = getTargetMachine().getTargetTriple().getOS();
7689 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7690}
7691
7692/// This transforms the control flow intrinsics to get the branch destination as
7693/// the last parameter, and switches the branch target with BR if the need arises.
7694SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7695 SDLoc DL(BRCOND);
7696
7697 SDNode *Intr = BRCOND.getOperand(1).getNode();
7698 SDValue Target = BRCOND.getOperand(2);
7699 SDNode *BR = nullptr;
7700 SDNode *SetCC = nullptr;
7701
7702 switch (Intr->getOpcode()) {
7703 case ISD::SETCC: {
7704 // As long as we negate the condition everything is fine
7705 SetCC = Intr;
7706 Intr = SetCC->getOperand(0).getNode();
7707 break;
7708 }
7709 case ISD::XOR: {
7710 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7711 SDValue LHS = Intr->getOperand(0);
7712 SDValue RHS = Intr->getOperand(1);
7713 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7714 Intr = LHS.getNode();
7715 break;
7716 }
7717 [[fallthrough]];
7718 }
7719 default: {
7720 // Get the target from BR if we don't negate the condition
7721 BR = findUser(BRCOND, ISD::BR);
7722 assert(BR && "brcond missing unconditional branch user");
7723 Target = BR->getOperand(1);
7724 }
7725 }
7726
7727 unsigned CFNode = isCFIntrinsic(Intr);
7728 if (CFNode == 0) {
7729 // This is a uniform branch so we don't need to legalize.
7730 return BRCOND;
7731 }
7732
7733 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7734                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7735
7736 assert(!SetCC ||
7737 (SetCC->getConstantOperandVal(1) == 1 &&
7738 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7739 ISD::SETNE));
7740
7741 // operands of the new intrinsic call
7742  SmallVector<SDValue, 8> Ops;
7743  if (HaveChain)
7744 Ops.push_back(BRCOND.getOperand(0));
7745
7746 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7747 Ops.push_back(Target);
7748
7749 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7750
7751 // build the new intrinsic call
7752 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7753
7754 if (!HaveChain) {
7755 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7756
7757    Result = DAG.getMergeValues(Ops, DL).getNode();
7758  }
7759
7760 if (BR) {
7761 // Give the branch instruction our target
7762 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7763 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7764 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7765 }
7766
7767 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7768
7769 // Copy the intrinsic results to registers
7770 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7771 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7772 if (!CopyToReg)
7773 continue;
7774
7775 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7776 SDValue(Result, i - 1), SDValue());
7777
7778 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7779 }
7780
7781 // Remove the old intrinsic from the chain
7782 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7783 Intr->getOperand(0));
7784
7785 return Chain;
7786}
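// Illustrative sketch (not part of the original source): a branch on the
// result of llvm.amdgcn.if/else/loop is rewritten so the destination block
// becomes the last operand of the control-flow node, e.g.
//   brcond (llvm.amdgcn.if %cond), %then
//     -> (AMDGPUISD::IF chain, %cond, BasicBlock:%then)
// and any unconditional BR user is retargeted to match.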
7787
7788SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7789 MVT VT = Op.getSimpleValueType();
7790 SDLoc DL(Op);
7791 // Checking the depth
7792 if (Op.getConstantOperandVal(0) != 0)
7793 return DAG.getConstant(0, DL, VT);
7794
7795 MachineFunction &MF = DAG.getMachineFunction();
7796 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7797 // Check for kernel and shader functions
7798 if (Info->isEntryFunction())
7799 return DAG.getConstant(0, DL, VT);
7800
7801 MachineFrameInfo &MFI = MF.getFrameInfo();
7802 // There is a call to @llvm.returnaddress in this function
7803 MFI.setReturnAddressIsTaken(true);
7804
7805 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7806 // Get the return address reg and mark it as an implicit live-in
7807 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7808 getRegClassFor(VT, Op.getNode()->isDivergent()));
7809
7810 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7811}
7812
7813SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7814 const SDLoc &DL, EVT VT) const {
7815 return Op.getValueType().bitsLE(VT)
7816 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7817 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7818 DAG.getTargetConstant(0, DL, MVT::i32));
7819}
7820
7821SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7822 SelectionDAG &DAG) const {
7823 EVT DstVT = Op.getValueType();
7824 unsigned NumElts = DstVT.getVectorNumElements();
7825 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7826
7827 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7828
7829 SDLoc DL(Op);
7830 unsigned Opc = Op.getOpcode();
7831 SDValue Flags = Op.getOperand(1);
7832 EVT HalfDstVT =
7833 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7834 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7835 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7836
7837 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7838}
7839
7840SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7841 SDValue Src = Op.getOperand(0);
7842 EVT SrcVT = Src.getValueType();
7843 EVT DstVT = Op.getValueType();
7844
7845 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7846 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7847 if (SrcVT.getScalarType() != MVT::f32)
7848 return SDValue();
7849 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7850 }
7851
7852 if (SrcVT.getScalarType() != MVT::f64)
7853 return Op;
7854
7855 SDLoc DL(Op);
7856 if (DstVT == MVT::f16) {
7857 // TODO: Handle strictfp
7858 if (Op.getOpcode() != ISD::FP_ROUND)
7859 return Op;
7860
7861 if (!Subtarget->has16BitInsts()) {
7862 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7863 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7864 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7865 }
7866 if (Op->getFlags().hasApproximateFuncs()) {
7867 SDValue Flags = Op.getOperand(1);
7868 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7869 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7870 }
7871 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7872 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7873 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7874 }
7875
7876 assert(DstVT.getScalarType() == MVT::bf16 &&
7877 "custom lower FP_ROUND for f16 or bf16");
7878 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7879
7880 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7881 // hardware f32 -> bf16 instruction.
7882 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
7883 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7884 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7885 DAG.getTargetConstant(0, DL, MVT::i32));
7886}
7887
7888SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7889 SelectionDAG &DAG) const {
7890 EVT VT = Op.getValueType();
7891 const MachineFunction &MF = DAG.getMachineFunction();
7892 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7893 bool IsIEEEMode = Info->getMode().IEEE;
7894
7895 // FIXME: Assert during selection that this is only selected for
7896 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7897 // mode functions, but this happens to be OK since it's only done in cases
7898 // where there is known no sNaN.
7899 if (IsIEEEMode)
7900 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7901
7902 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7903 VT == MVT::v16bf16)
7904 return splitBinaryVectorOp(Op, DAG);
7905 return Op;
7906}
7907
7908SDValue
7909SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7910 SelectionDAG &DAG) const {
7911 EVT VT = Op.getValueType();
7912 const MachineFunction &MF = DAG.getMachineFunction();
7913 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7914 bool IsIEEEMode = Info->getMode().IEEE;
7915
7916 if (IsIEEEMode)
7917 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7918
7919 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7920 VT == MVT::v16bf16)
7921 return splitBinaryVectorOp(Op, DAG);
7922 return Op;
7923}
7924
7925SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7926 SelectionDAG &DAG) const {
7927 EVT VT = Op.getValueType();
7928 if (VT.isVector())
7929 return splitBinaryVectorOp(Op, DAG);
7930
7931 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7932 !Subtarget->hasMinimum3Maximum3F16() &&
7933 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7934 "should not need to widen f16 minimum/maximum to v2f16");
7935
7936 // Widen f16 operation to v2f16
7937
7938 // fminimum f16:x, f16:y ->
7939 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7940 // (v2f16 (scalar_to_vector y))), 0
7941 SDLoc SL(Op);
7942 SDValue WideSrc0 =
7943 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7944 SDValue WideSrc1 =
7945 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7946
7947 SDValue Widened =
7948 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7949
7950 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7951 DAG.getConstant(0, SL, MVT::i32));
7952}
7953
7954SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7955 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7956 EVT VT = Op.getValueType();
7957 assert(VT == MVT::f16);
7958
7959 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7960 EVT ExpVT = Exp.getValueType();
7961 if (ExpVT == MVT::i16)
7962 return Op;
7963
7964 SDLoc DL(Op);
7965
7966 // Correct the exponent type for f16 to i16.
7967 // Clamp the range of the exponent to the instruction's range.
7968
7969 // TODO: This should be a generic narrowing legalization, and can easily be
7970  // done for GlobalISel.
7971
7972 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7973 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7974
7975 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7976 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7977
7978 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7979
7980 if (IsStrict) {
7981 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7982 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7983 }
7984
7985 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7986}
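// Illustrative sketch (not part of the original source): v_ldexp_f16 takes a
// 16-bit exponent, so an i32 exponent operand is clamped and truncated first:
//   (fldexp f16 %x, i32 %e)
//     -> (fldexp f16 %x, (trunc i16 (smin (smax %e, -32768), 32767)))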
7987
7988static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7989  switch (Op->getOpcode()) {
7990 case ISD::SRA:
7991 case ISD::SMIN:
7992 case ISD::SMAX:
7993 return ISD::SIGN_EXTEND;
7994 case ISD::SRL:
7995 case ISD::UMIN:
7996 case ISD::UMAX:
7997 return ISD::ZERO_EXTEND;
7998 case ISD::ADD:
7999 case ISD::SUB:
8000 case ISD::AND:
8001 case ISD::OR:
8002 case ISD::XOR:
8003 case ISD::SHL:
8004 case ISD::SELECT:
8005 case ISD::MUL:
8006 // operation result won't be influenced by garbage high bits.
8007 // TODO: are all of those cases correct, and are there more?
8008 return ISD::ANY_EXTEND;
8009 case ISD::SETCC: {
8010 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8011    return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8012  }
8013 default:
8014 llvm_unreachable("unexpected opcode!");
8015 }
8016}
8017
8018SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8019 DAGCombinerInfo &DCI) const {
8020 const unsigned Opc = Op.getOpcode();
8021 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8022 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8023 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8024 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8025 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8026
8027 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8028 : Op->getOperand(0).getValueType();
8029 auto &DAG = DCI.DAG;
8030 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8031
8032 if (DCI.isBeforeLegalizeOps() ||
8033 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8034 return SDValue();
8035
8036 SDLoc DL(Op);
8037 SDValue LHS;
8038 SDValue RHS;
8039 if (Opc == ISD::SELECT) {
8040 LHS = Op->getOperand(1);
8041 RHS = Op->getOperand(2);
8042 } else {
8043 LHS = Op->getOperand(0);
8044 RHS = Op->getOperand(1);
8045 }
8046
8047 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8048 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8049
8050 // Special case: for shifts, the RHS always needs a zext.
8051 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8052 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8053 else
8054 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8055
8056  // setcc always returns i1/i1 vec so no need to truncate after.
8057 if (Opc == ISD::SETCC) {
8058 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8059 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8060 }
8061
8062 // For other ops, we extend the operation's return type as well so we need to
8063 // truncate back to the original type.
8064 SDValue NewVal;
8065 if (Opc == ISD::SELECT)
8066 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8067 else
8068 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8069
8070 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8071}
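// Illustrative sketch (not part of the original source): a uniform 16-bit
// operation is widened so it can use the 32-bit scalar ALU, e.g.
//   (add i16 %a, %b)
//     -> (trunc i16 (add i32 (any_extend %a), (any_extend %b)))
// shift amounts are always zero-extended, and setcc keeps its i1 result so no
// truncation is needed there.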
8072
8073SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8074 SDValue Mag = Op.getOperand(0);
8075 EVT MagVT = Mag.getValueType();
8076
8077 if (MagVT.getVectorNumElements() > 2)
8078 return splitBinaryVectorOp(Op, DAG);
8079
8080 SDValue Sign = Op.getOperand(1);
8081 EVT SignVT = Sign.getValueType();
8082
8083 if (MagVT == SignVT)
8084 return Op;
8085
8086 // fcopysign v2f16:mag, v2f32:sign ->
8087 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8088
8089 SDLoc SL(Op);
8090 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8091 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8092
8093 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8094
8095 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8096}
8097
8098// Custom lowering for vector multiplications and s_mul_u64.
8099SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8100 EVT VT = Op.getValueType();
8101
8102 // Split vector operands.
8103 if (VT.isVector())
8104 return splitBinaryVectorOp(Op, DAG);
8105
8106 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8107
8108 // There are four ways to lower s_mul_u64:
8109 //
8110 // 1. If all the operands are uniform, then we lower it as it is.
8111 //
8112 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8113 // multiplications because there is not a vector equivalent of s_mul_u64.
8114 //
8115 // 3. If the cost model decides that it is more efficient to use vector
8116 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8117 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8118 //
8119 // 4. If the cost model decides to use vector registers and both of the
8120 // operands are zero-extended/sign-extended from 32-bits, then we split the
8121 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8122 // possible to check if the operands are zero-extended or sign-extended in
8123 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8124 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8125 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8126 // If the cost model decides that we have to use vector registers, then
8127 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8128 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8129 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8130 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8131 // SIInstrInfo.cpp .
8132
8133 if (Op->isDivergent())
8134 return SDValue();
8135
8136 SDValue Op0 = Op.getOperand(0);
8137 SDValue Op1 = Op.getOperand(1);
8138  // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
8139 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8140 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8141 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8142 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8143 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8144 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8145 SDLoc SL(Op);
8146 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8147 return SDValue(
8148 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8149 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8150 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8151 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8152 return SDValue(
8153 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8154 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8155 return Op;
8156}
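// Illustrative sketch (not part of the original source): for a uniform
// multiply where both operands are known zero-extended from 32 bits, e.g.
//   %m = mul i64 (zext i32 %a), (zext i32 %b)
// the node is rewritten to S_MUL_U64_U32_PSEUDO so a later pass can still
// split it into 32-bit multiplies if the operands end up in vector registers.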
8157
8158SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8159 EVT VT = Op.getValueType();
8160 SDLoc SL(Op);
8161 SDValue LHS = Op.getOperand(0);
8162 SDValue RHS = Op.getOperand(1);
8163 bool isSigned = Op.getOpcode() == ISD::SMULO;
8164
8165 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8166 const APInt &C = RHSC->getAPIntValue();
8167 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8168 if (C.isPowerOf2()) {
8169 // smulo(x, signed_min) is same as umulo(x, signed_min).
8170 bool UseArithShift = isSigned && !C.isMinSignedValue();
8171 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8172 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8173 SDValue Overflow =
8174 DAG.getSetCC(SL, MVT::i1,
8175 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8176 Result, ShiftAmt),
8177 LHS, ISD::SETNE);
8178 return DAG.getMergeValues({Result, Overflow}, SL);
8179 }
8180 }
8181
8182 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8183 SDValue Top =
8184 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8185
8186 SDValue Sign = isSigned
8187 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8188 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8189 SL, MVT::i32))
8190 : DAG.getConstant(0, SL, VT);
8191 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8192
8193 return DAG.getMergeValues({Result, Overflow}, SL);
8194}
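// Illustrative sketch (not part of the original source): for a power-of-two
// constant the overflow check avoids a high multiply, e.g.
//   umulo(i32 %x, 16) -> { %r = shl %x, 4;  overflow = (lshr %r, 4) != %x }
// otherwise the high half of the product is compared against the expected
// sign bits (or zero) of the low half.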
8195
8196SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8197 if (Op->isDivergent()) {
8198 // Select to V_MAD_[IU]64_[IU]32.
8199 return Op;
8200 }
8201 if (Subtarget->hasSMulHi()) {
8202 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8203 return SDValue();
8204 }
8205 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8206 // calculate the high part, so we might as well do the whole thing with
8207 // V_MAD_[IU]64_[IU]32.
8208 return Op;
8209}
8210
8211SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8212 if (!Subtarget->isTrapHandlerEnabled() ||
8213 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8214 return lowerTrapEndpgm(Op, DAG);
8215
8216 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8217 : lowerTrapHsaQueuePtr(Op, DAG);
8218}
8219
8220SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8221 SDLoc SL(Op);
8222 SDValue Chain = Op.getOperand(0);
8223 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8224}
8225
8226SDValue
8227SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8228 const SDLoc &DL, Align Alignment,
8229 ImplicitParameter Param) const {
8230 MachineFunction &MF = DAG.getMachineFunction();
8231 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8232 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8233 MachinePointerInfo PtrInfo =
8235 return DAG.getLoad(
8236 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8237      MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8238}
8239
8240SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8241 SelectionDAG &DAG) const {
8242 SDLoc SL(Op);
8243 SDValue Chain = Op.getOperand(0);
8244
8245 SDValue QueuePtr;
8246 // For code object version 5, QueuePtr is passed through implicit kernarg.
8247 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8248  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8249    QueuePtr =
8250 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8251 } else {
8252 MachineFunction &MF = DAG.getMachineFunction();
8253 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8254 Register UserSGPR = Info->getQueuePtrUserSGPR();
8255
8256 if (UserSGPR == AMDGPU::NoRegister) {
8257 // We probably are in a function incorrectly marked with
8258 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8259 // trap, so just use a null pointer.
8260 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8261 } else {
8262 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8263 MVT::i64);
8264 }
8265 }
8266
8267 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8268 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8269
8270 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8271 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8272 ToReg.getValue(1)};
8273 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8274}
8275
8276SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8277 SDLoc SL(Op);
8278 SDValue Chain = Op.getOperand(0);
8279
8280 // We need to simulate the 's_trap 2' instruction on targets that run in
8281 // PRIV=1 (where it is treated as a nop).
8282 if (Subtarget->hasPrivEnabledTrap2NopBug())
8283 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8284
8285 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8286 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8287 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8288}
8289
8290SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8291 SDLoc SL(Op);
8292 SDValue Chain = Op.getOperand(0);
8293 MachineFunction &MF = DAG.getMachineFunction();
8294
8295 if (!Subtarget->isTrapHandlerEnabled() ||
8296 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8297 LLVMContext &Ctx = MF.getFunction().getContext();
8298 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8299 "debugtrap handler not supported",
8300 Op.getDebugLoc(), DS_Warning));
8301 return Chain;
8302 }
8303
8304 uint64_t TrapID =
8305 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8306 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8307 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8308}
8309
8310SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8311 SelectionDAG &DAG) const {
8312 if (Subtarget->hasApertureRegs()) {
8313 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8314 ? AMDGPU::SRC_SHARED_BASE
8315 : AMDGPU::SRC_PRIVATE_BASE;
8316 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8317 !Subtarget->hasGloballyAddressableScratch()) &&
8318 "Cannot use src_private_base with globally addressable scratch!");
8319 // Note: this feature (register) is broken. When used as a 32-bit operand,
8320 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8321 // bits.
8322 //
8323 // To work around the issue, emit a 64 bit copy from this register
8324 // then extract the high bits. Note that this shouldn't even result in a
8325 // shift being emitted and simply become a pair of registers (e.g.):
8326 // s_mov_b64 s[6:7], src_shared_base
8327 // v_mov_b32_e32 v1, s7
8328 SDValue Copy =
8329 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8330 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8331 }
8332
8333 // For code object version 5, private_base and shared_base are passed through
8334 // implicit kernargs.
8335 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8336  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8337    ImplicitParameter Param =
8338        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8339    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8340 }
8341
8342 MachineFunction &MF = DAG.getMachineFunction();
8343 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8344 Register UserSGPR = Info->getQueuePtrUserSGPR();
8345 if (UserSGPR == AMDGPU::NoRegister) {
8346 // We probably are in a function incorrectly marked with
8347 // amdgpu-no-queue-ptr. This is undefined.
8348 return DAG.getPOISON(MVT::i32);
8349 }
8350
8351 SDValue QueuePtr =
8352 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8353
8354 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8355 // private_segment_aperture_base_hi.
8356 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8357
8358 SDValue Ptr =
8359 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8360
8361 // TODO: Use custom target PseudoSourceValue.
8362 // TODO: We should use the value from the IR intrinsic call, but it might not
8363 // be available and how do we get it?
8364 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8365 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8366 commonAlignment(Align(64), StructOffset),
8369}
8370
8371/// Return true if the value is a known valid address, such that a null check is
8372/// not necessary.
8373static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8374                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8375  if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8376    return true;
8377
8378 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8379 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8380
8381 // TODO: Search through arithmetic, handle arguments and loads
8382 // marked nonnull.
8383 return false;
8384}
8385
8386SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8387 SelectionDAG &DAG) const {
8388 SDLoc SL(Op);
8389
8390 const AMDGPUTargetMachine &TM =
8391 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8392
8393 unsigned DestAS, SrcAS;
8394 SDValue Src;
8395 bool IsNonNull = false;
8396 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8397 SrcAS = ASC->getSrcAddressSpace();
8398 Src = ASC->getOperand(0);
8399 DestAS = ASC->getDestAddressSpace();
8400 } else {
8401 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8402 Op.getConstantOperandVal(0) ==
8403 Intrinsic::amdgcn_addrspacecast_nonnull);
8404 Src = Op->getOperand(1);
8405 SrcAS = Op->getConstantOperandVal(2);
8406 DestAS = Op->getConstantOperandVal(3);
8407 IsNonNull = true;
8408 }
8409
8410 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8411
8412 // flat -> local/private
8413 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8414 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8415 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8416 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8417
8418 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8419 Subtarget->hasGloballyAddressableScratch()) {
8420 // flat -> private with globally addressable scratch: subtract
8421 // src_flat_scratch_base_lo.
8422 SDValue FlatScratchBaseLo(
8423 DAG.getMachineNode(
8424 AMDGPU::S_MOV_B32, SL, MVT::i32,
8425 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8426 0);
8427 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8428 }
8429
8430 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8431 return Ptr;
8432
8433 unsigned NullVal = TM.getNullPointerValue(DestAS);
8434 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8435 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8436
8437 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8438 SegmentNullPtr);
8439 }
8440 }
8441
8442 // local/private -> flat
8443 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8444 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8445 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8446 SDValue CvtPtr;
8447 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8448 Subtarget->hasGloballyAddressableScratch()) {
8449 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8450 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8451 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8452 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8453 ThreadID = DAG.getNode(
8454 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8455 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8456 AllOnes, ThreadID);
8457 if (Subtarget->isWave64())
8458 ThreadID = DAG.getNode(
8459 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8460 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8461 AllOnes, ThreadID);
8462 SDValue ShAmt = DAG.getShiftAmountConstant(
8463 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8464 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8465 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8466 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8467 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8468 // 64-bit hi:lo value.
8469 SDValue FlatScratchBase = {
8470 DAG.getMachineNode(
8471 AMDGPU::S_MOV_B64, SL, MVT::i64,
8472 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8473 0};
8474 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8475 } else {
8476 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8477 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8478 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8479 }
8480
8481 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8482 return CvtPtr;
8483
8484 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8485 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8486
8487 SDValue NonNull =
8488 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8489
8490 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8491 FlatNullPtr);
8492 }
8493 }
8494
8495 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8496 Op.getValueType() == MVT::i64) {
8497 const SIMachineFunctionInfo *Info =
8498 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8499 if (Info->get32BitAddressHighBits() == 0)
8500 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
8501
8502 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8503 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8504 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8505 }
8506
8507 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8508 Src.getValueType() == MVT::i64)
8509 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8510
8511 // global <-> flat are no-ops and never emitted.
8512
8513 // Invalid casts are poison.
8514 return DAG.getPOISON(Op->getValueType(0));
8515}
8516
8517// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8518// the small vector and inserting them into the big vector. That is better than
8519// the default expansion of doing it via a stack slot. Even though the use of
8520// the stack slot would be optimized away afterwards, the stack slot itself
8521// remains.
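// For example (an illustrative sketch of the expansion below, not taken from
// a specific test): inserting a v2f32 subvector into a v4f32 vector at index
// 2 becomes
//   %e0  = extract_vector_elt %ins, 0
//   %v1  = insert_vector_elt  %vec, %e0, 2
//   %e1  = extract_vector_elt %ins, 1
//   %res = insert_vector_elt  %v1, %e1, 3
// For 16-bit elements at an even index, the vectors are first bitcast to i32
// vectors so whole 32-bit registers are moved instead.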
8522SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8523 SelectionDAG &DAG) const {
8524 SDValue Vec = Op.getOperand(0);
8525 SDValue Ins = Op.getOperand(1);
8526 SDValue Idx = Op.getOperand(2);
8527 EVT VecVT = Vec.getValueType();
8528 EVT InsVT = Ins.getValueType();
8529 EVT EltVT = VecVT.getVectorElementType();
8530 unsigned InsNumElts = InsVT.getVectorNumElements();
8531 unsigned IdxVal = Idx->getAsZExtVal();
8532 SDLoc SL(Op);
8533
8534 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8535 // Insert 32-bit registers at a time.
8536 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8537
8538 unsigned VecNumElts = VecVT.getVectorNumElements();
8539 EVT NewVecVT =
8540 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8541 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8542 : EVT::getVectorVT(*DAG.getContext(),
8543 MVT::i32, InsNumElts / 2);
8544
8545 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8546 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8547
8548 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8549 SDValue Elt;
8550 if (InsNumElts == 2) {
8551 Elt = Ins;
8552 } else {
8553 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8554 DAG.getConstant(I, SL, MVT::i32));
8555 }
8556 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8557 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8558 }
8559
8560 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8561 }
8562
8563 for (unsigned I = 0; I != InsNumElts; ++I) {
8564 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8565 DAG.getConstant(I, SL, MVT::i32));
8566 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8567 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8568 }
8569 return Vec;
8570}
8571
8572SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8573 SelectionDAG &DAG) const {
8574 SDValue Vec = Op.getOperand(0);
8575 SDValue InsVal = Op.getOperand(1);
8576 SDValue Idx = Op.getOperand(2);
8577 EVT VecVT = Vec.getValueType();
8578 EVT EltVT = VecVT.getVectorElementType();
8579 unsigned VecSize = VecVT.getSizeInBits();
8580 unsigned EltSize = EltVT.getSizeInBits();
8581 SDLoc SL(Op);
8582
8583 // Specially handle the case of v4i16 with static indexing.
8584 unsigned NumElts = VecVT.getVectorNumElements();
8585 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8586 if (NumElts == 4 && EltSize == 16 && KIdx) {
8587 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8588
8589 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8590 DAG.getConstant(0, SL, MVT::i32));
8591 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8592 DAG.getConstant(1, SL, MVT::i32));
8593
8594 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8595 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8596
8597 unsigned Idx = KIdx->getZExtValue();
8598 bool InsertLo = Idx < 2;
8599 SDValue InsHalf = DAG.getNode(
8600 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8601 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8602 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8603
8604 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8605
8606 SDValue Concat =
8607 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8608 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8609
8610 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8611 }
8612
8613 // Static indexing does not lower to stack access, and hence there is no need
8614 // for special custom lowering to avoid stack access.
8615 if (isa<ConstantSDNode>(Idx))
8616 return SDValue();
8617
8618 // Avoid stack access for dynamic indexing by custom lowering to
8619 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
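// As an illustrative sketch for a v2i16 vector (EltSize = 16) with a dynamic
// index %idx (value names are hypothetical):
//   %bitidx = shl %idx, 4                        ; element index -> bit index
//   %bfm    = shl i32 0xffff, %bitidx            ; mask for the target element
//   %splat  = bitcast (build_vector %val, %val)  ; value in every element
//   %lhs    = and %bfm, %splat
//   %rhs    = and (not %bfm), (bitcast %vec)
//   %result = bitcast (or disjoint %lhs, %rhs)
// which instruction selection can then match to v_bfm_b32 / v_bfi_b32.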
8620
8621 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8622
8623 MVT IntVT = MVT::getIntegerVT(VecSize);
8624
8625 // Convert vector index to bit-index and get the required bit mask.
8626 assert(isPowerOf2_32(EltSize));
8627 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8628 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8629 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8630 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8631 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8632
8633 // 1. Create a congruent vector with the target value in each element.
8634 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8635 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8636
8637 // 2. Mask off all other indices except the required index within (1).
8638 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8639
8640 // 3. Mask off the required index within the target vector.
8641 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8642 SDValue RHS =
8643 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8644
8645 // 4. Get (2) and (3) ORed into the target vector.
8646 SDValue BFI =
8647 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8648
8649 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8650}
8651
8652SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8653 SelectionDAG &DAG) const {
8654 SDLoc SL(Op);
8655
8656 EVT ResultVT = Op.getValueType();
8657 SDValue Vec = Op.getOperand(0);
8658 SDValue Idx = Op.getOperand(1);
8659 EVT VecVT = Vec.getValueType();
8660 unsigned VecSize = VecVT.getSizeInBits();
8661 EVT EltVT = VecVT.getVectorElementType();
8662
8663 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8664
8665 // Make sure we do any optimizations that will make it easier to fold
8666 // source modifiers before obscuring it with bit operations.
8667
8668 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8669 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8670 return Combined;
8671
8672 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8673 SDValue Lo, Hi;
8674 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8675
8676 if (VecSize == 128) {
8677 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8678 Lo = DAG.getBitcast(LoVT,
8679 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8680 DAG.getConstant(0, SL, MVT::i32)));
8681 Hi = DAG.getBitcast(HiVT,
8682 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8683 DAG.getConstant(1, SL, MVT::i32)));
8684 } else if (VecSize == 256) {
8685 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8686 SDValue Parts[4];
8687 for (unsigned P = 0; P < 4; ++P) {
8688 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8689 DAG.getConstant(P, SL, MVT::i32));
8690 }
8691
8692 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8693 Parts[0], Parts[1]));
8694 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8695 Parts[2], Parts[3]));
8696 } else {
8697 assert(VecSize == 512);
8698
8699 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8700 SDValue Parts[8];
8701 for (unsigned P = 0; P < 8; ++P) {
8702 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8703 DAG.getConstant(P, SL, MVT::i32));
8704 }
8705
8706 Lo = DAG.getBitcast(LoVT,
8707 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8708 Parts[0], Parts[1], Parts[2], Parts[3]));
8709 Hi = DAG.getBitcast(HiVT,
8710 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8711 Parts[4], Parts[5], Parts[6], Parts[7]));
8712 }
8713
8714 EVT IdxVT = Idx.getValueType();
8715 unsigned NElem = VecVT.getVectorNumElements();
8716 assert(isPowerOf2_32(NElem));
8717 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8718 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8719 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8720 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8721 }
8722
8723 assert(VecSize <= 64);
8724
8725 MVT IntVT = MVT::getIntegerVT(VecSize);
8726
8727 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8728 SDValue VecBC = peekThroughBitcasts(Vec);
8729 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8730 SDValue Src = VecBC.getOperand(0);
8731 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8732 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8733 }
8734
8735 unsigned EltSize = EltVT.getSizeInBits();
8736 assert(isPowerOf2_32(EltSize));
8737
8738 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8739
8740 // Convert vector index to bit-index (* EltSize)
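// For example (a sketch of the nodes built below), extracting element %idx
// from a 64-bit v4i16:
//   %bitidx = shl %idx, 4
//   %elt    = trunc (srl (bitcast %vec to i64), %bitidx) to i16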
8741 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8742
8743 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8744 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8745
8746 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8747 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8748 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8749 }
8750
8751 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8752}
8753
8754static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8755 assert(Elt % 2 == 0);
8756 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8757}
8758
8759static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8760 assert(Elt % 2 == 0);
8761 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8762 !(Mask[Elt + 1] & 1);
8763}
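// For example, for the mask <0,1,6,7> both pairs are contiguous (each pair
// reads two consecutive source elements starting at an even index), while for
// the mask <3,2,7,6> both pairs are odd-to-even swaps of an aligned element
// pair. These predicates drive the sub-shuffle formation below.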
8764
8765SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8766 SelectionDAG &DAG) const {
8767 SDLoc SL(Op);
8768 EVT ResultVT = Op.getValueType();
8769 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8770 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8771 const int NewSrcNumElts = 2;
8772 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8773 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8774
8775 // Break up the shuffle into register-sized pieces.
8776 //
8777 // We're trying to form sub-shuffles that the register allocation pipeline
8778 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8779 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8780 // pair of copies into a consecutive register copy, so use the ordinary
8781 // extract_vector_elt lowering unless we can use the shuffle.
8782 //
8783 // TODO: This is a bit of hack, and we should probably always use
8784 // extract_subvector for the largest possible subvector we can (or at least
8785 // use it for PackVT aligned pieces). However, we have worse support for
8786 // combines on them and don't directly treat extract_subvector / insert_subvector
8787 // as legal. The DAG scheduler also ends up doing a worse job with the
8788 // extract_subvectors.
8789 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8790
8791 // vector_shuffle <0,1,6,7> lhs, rhs
8792 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8793 //
8794 // vector_shuffle <6,7,2,3> lhs, rhs
8795 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8796 //
8797 // vector_shuffle <6,7,0,1> lhs, rhs
8798 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8799
8800 // Avoid scalarizing when both halves are reading from consecutive elements.
8801
8802 // If we're treating 2 element shuffles as legal, also create odd-to-even
8803 // shuffles of neighboring pairs.
8804 //
8805 // vector_shuffle <3,2,7,6> lhs, rhs
8806 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8807 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8808
8809 SmallVector<SDValue, 16> Pieces;
8810 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8811 if (ShouldUseConsecutiveExtract &&
8812 elementPairIsContiguous(SVN->getMask(), I)) {
8813 const int Idx = SVN->getMaskElt(I);
8814 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8815 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8816 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8817 SVN->getOperand(VecIdx),
8818 DAG.getConstant(EltIdx, SL, MVT::i32));
8819 Pieces.push_back(SubVec);
8820 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8821 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8822 int Idx0 = SVN->getMaskElt(I);
8823 int Idx1 = SVN->getMaskElt(I + 1);
8824
8825 SDValue SrcOp0 = SVN->getOperand(0);
8826 SDValue SrcOp1 = SrcOp0;
8827 if (Idx0 >= SrcNumElts) {
8828 SrcOp0 = SVN->getOperand(1);
8829 Idx0 -= SrcNumElts;
8830 }
8831
8832 if (Idx1 >= SrcNumElts) {
8833 SrcOp1 = SVN->getOperand(1);
8834 Idx1 -= SrcNumElts;
8835 }
8836
8837 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8838 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8839
8840 // Extract nearest even aligned piece.
8841 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8842 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8843 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8844 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8845
8846 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8847 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8848
8849 SDValue Result0 = SubVec0;
8850 SDValue Result1 = SubVec0;
8851
8852 if (SubVec0 != SubVec1) {
8853 NewMaskIdx1 += NewSrcNumElts;
8854 Result1 = SubVec1;
8855 } else {
8856 Result1 = DAG.getPOISON(PackVT);
8857 }
8858
8859 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8860 {NewMaskIdx0, NewMaskIdx1});
8861 Pieces.push_back(Shuf);
8862 } else {
8863 const int Idx0 = SVN->getMaskElt(I);
8864 const int Idx1 = SVN->getMaskElt(I + 1);
8865 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8866 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8867 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8868 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8869
8870 SDValue Vec0 = SVN->getOperand(VecIdx0);
8871 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8872 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8873
8874 SDValue Vec1 = SVN->getOperand(VecIdx1);
8875 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8876 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8877 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8878 }
8879 }
8880
8881 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8882}
8883
8884SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8885 SelectionDAG &DAG) const {
8886 SDValue SVal = Op.getOperand(0);
8887 EVT ResultVT = Op.getValueType();
8888 EVT SValVT = SVal.getValueType();
8889 SDValue UndefVal = DAG.getPOISON(SValVT);
8890 SDLoc SL(Op);
8891
8892 SmallVector<SDValue, 16> VElts;
8893 VElts.push_back(SVal);
8894 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8895 VElts.push_back(UndefVal);
8896
8897 return DAG.getBuildVector(ResultVT, SL, VElts);
8898}
8899
8900SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8901 SelectionDAG &DAG) const {
8902 SDLoc SL(Op);
8903 EVT VT = Op.getValueType();
8904
8905 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8906 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8907
8908 SDValue Lo = Op.getOperand(0);
8909 SDValue Hi = Op.getOperand(1);
8910
8911 // Avoid adding defined bits with the zero_extend.
8912 if (Hi.isUndef()) {
8913 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8914 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8915 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8916 }
8917
8918 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8919 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8920
8921 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8922 DAG.getConstant(16, SL, MVT::i32));
8923 if (Lo.isUndef())
8924 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8925
8926 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8927 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8928
8929 SDValue Or =
8930 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8931 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8932 }
8933
8934 // Split into 2-element chunks.
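// For example (a sketch of the expansion below), a v8f16 build_vector is
// lowered as four v2f16 build_vectors, each bitcast to i32, gathered into a
// v4i32 build_vector and finally bitcast back to v8f16.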
8935 const unsigned NumParts = VT.getVectorNumElements() / 2;
8936 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8937 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8938
8939 SmallVector<SDValue, 16> Casts;
8940 for (unsigned P = 0; P < NumParts; ++P) {
8941 SDValue Vec = DAG.getBuildVector(
8942 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8943 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8944 }
8945
8946 SDValue Blend =
8947 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8948 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8949}
8950
8951bool SITargetLowering::isOffsetFoldingLegal(
8952 const GlobalAddressSDNode *GA) const {
8953 // OSes that use ELF REL relocations (instead of RELA) can only store a
8954 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8955 // which can create arbitrary 64-bit addends. (This is only a problem for
8956 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8957 // the high 32 bits of the addend.)
8958 //
8959 // This should be kept in sync with how HasRelocationAddend is initialized in
8960 // the constructor of ELFAMDGPUAsmBackend.
8961 if (!Subtarget->isAmdHsaOS())
8962 return false;
8963
8964 // We can fold offsets for anything that doesn't require a GOT relocation.
8965 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8966 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8967 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8968 !shouldEmitGOTReloc(GA->getGlobal());
8969}
8970
8971static SDValue
8972buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8973 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8974 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8975 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8976 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8977 // lowered to the following code sequence:
8978 //
8979 // For constant address space:
8980 // s_getpc_b64 s[0:1]
8981 // s_add_u32 s0, s0, $symbol
8982 // s_addc_u32 s1, s1, 0
8983 //
8984 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8985 // a fixup or relocation is emitted to replace $symbol with a literal
8986 // constant, which is a pc-relative offset from the encoding of the $symbol
8987 // operand to the global variable.
8988 //
8989 // For global address space:
8990 // s_getpc_b64 s[0:1]
8991 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8992 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8993 //
8994 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8995 // fixups or relocations are emitted to replace $symbol@*@lo and
8996 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8997 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8998 // operand to the global variable.
8999 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9000 assert(GAFlags != SIInstrInfo::MO_NONE);
9001
9002 SDValue Ptr =
9003 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9004 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9005 }
9006
9007 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9008 SDValue PtrHi;
9009 if (GAFlags == SIInstrInfo::MO_NONE)
9010 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9011 else
9012 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9013 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9014}
9015
9016SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9017 SDValue Op,
9018 SelectionDAG &DAG) const {
9019 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9020 SDLoc DL(GSD);
9021 EVT PtrVT = Op.getValueType();
9022
9023 const GlobalValue *GV = GSD->getGlobal();
9029 GV->hasExternalLinkage()) {
9030 Type *Ty = GV->getValueType();
9031 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9032 // zero-sized type in other languages to declare the dynamic shared
9033 // memory, whose size is not known at compile time. They will be
9034 // allocated by the runtime and placed directly after the statically
9035 // allocated ones. They all share the same offset.
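// A hypothetical HIP example of what this handles (for illustration only):
//   extern __shared__ float dynamicLDS[]; // unsized, zero allocation size
// Taking the address of such a symbol is lowered below to
// GET_GROUPSTATICSIZE, i.e. the byte offset just past the statically
// allocated LDS.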
9036 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
9037 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9038 // Adjust alignment for that dynamic shared memory array.
9039 Function &F = DAG.getMachineFunction().getFunction();
9040 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
9041 MFI->setUsesDynamicLDS(true);
9042 return SDValue(
9043 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9044 }
9045 }
9047 }
9048
9050 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9051 SIInstrInfo::MO_ABS32_LO);
9052 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9053 }
9054
9055 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9056 if (Subtarget->has64BitLiterals()) {
9057 SDValue Addr = DAG.getTargetGlobalAddress(
9058 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9059 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9060 0);
9061 }
9062
9063 SDValue AddrLo = DAG.getTargetGlobalAddress(
9064 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9065 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9066
9067 SDValue AddrHi = DAG.getTargetGlobalAddress(
9068 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9069 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9070
9071 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9072 }
9073
9074 if (shouldEmitFixup(GV))
9075 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9076
9077 if (shouldEmitPCReloc(GV))
9078 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9079 SIInstrInfo::MO_REL32);
9080
9081 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9082 SIInstrInfo::MO_GOTPCREL32);
9083 PointerType *PtrTy =
9084 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
9085 const DataLayout &DataLayout = DAG.getDataLayout();
9086 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9087 MachinePointerInfo PtrInfo =
9088 MachinePointerInfo::getGOT(DAG.getMachineFunction());
9089
9090 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9091 MachineMemOperand::MODereferenceable |
9092 MachineMemOperand::MOInvariant);
9093}
9094
9095SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9096 SelectionDAG &DAG) const {
9097 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9098 const Function &Fn = DAG.getMachineFunction().getFunction();
9099 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9100 Fn, "unsupported external symbol", Op.getDebugLoc()));
9101 return DAG.getPOISON(Op.getValueType());
9102}
9103
9105 const SDLoc &DL, SDValue V) const {
9106 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9107 // the destination register.
9108 //
9109 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9110 // so we will end up with redundant moves to m0.
9111 //
9112 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9113
9114 // A Null SDValue creates a glue result.
9115 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9116 V, Chain);
9117 return SDValue(M0, 0);
9118}
9119
9120SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9121 MVT VT,
9122 unsigned Offset) const {
9123 SDLoc SL(Op);
9124 SDValue Param = lowerKernargMemParameter(
9125 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9126 // The local size values will have the hi 16-bits as zero.
9127 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9128 DAG.getValueType(VT));
9129}
9130
9131static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9132 EVT VT) {
9133 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9134 DAG.getMachineFunction().getFunction(),
9135 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9136 return DAG.getPOISON(VT);
9137}
9138
9139static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9140 EVT VT) {
9141 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9142 DAG.getMachineFunction().getFunction(),
9143 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9144 return DAG.getPOISON(VT);
9145}
9146
9147static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9148 ArrayRef<SDValue> Elts) {
9149 assert(!Elts.empty());
9150 MVT Type;
9151 unsigned NumElts = Elts.size();
9152
9153 if (NumElts <= 12) {
9154 Type = MVT::getVectorVT(MVT::f32, NumElts);
9155 } else {
9156 assert(Elts.size() <= 16);
9157 Type = MVT::v16f32;
9158 NumElts = 16;
9159 }
9160
9161 SmallVector<SDValue, 16> VecElts(NumElts);
9162 for (unsigned i = 0; i < Elts.size(); ++i) {
9163 SDValue Elt = Elts[i];
9164 if (Elt.getValueType() != MVT::f32)
9165 Elt = DAG.getBitcast(MVT::f32, Elt);
9166 VecElts[i] = Elt;
9167 }
9168 for (unsigned i = Elts.size(); i < NumElts; ++i)
9169 VecElts[i] = DAG.getPOISON(MVT::f32);
9170
9171 if (NumElts == 1)
9172 return VecElts[0];
9173 return DAG.getBuildVector(Type, DL, VecElts);
9174}
9175
9176static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9177 SDValue Src, int ExtraElts) {
9178 EVT SrcVT = Src.getValueType();
9179
9180 SmallVector<SDValue, 8> Elts;
9181
9182 if (SrcVT.isVector())
9183 DAG.ExtractVectorElements(Src, Elts);
9184 else
9185 Elts.push_back(Src);
9186
9187 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9188 while (ExtraElts--)
9189 Elts.push_back(Undef);
9190
9191 return DAG.getBuildVector(CastVT, DL, Elts);
9192}
9193
9194// Re-construct the required return value for an image load intrinsic.
9195// This is more complicated due to the optional use of TexFailCtrl, which means
9196// the required return type is an aggregate.
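// For example (an illustrative case): a 32-bit image load with dmask 0x3 and
// TFE enabled is selected with three result dwords; the first two are
// re-packed below as the requested data type, the third is returned as the
// TexFail status, and the chain is appended to the merged result.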
9197static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9198 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9199 bool Unpacked, bool IsD16, int DMaskPop,
9200 int NumVDataDwords, bool IsAtomicPacked16Bit,
9201 const SDLoc &DL) {
9202 // Determine the required return type. This is the same regardless of
9203 // IsTexFail flag
9204 EVT ReqRetVT = ResultTypes[0];
9205 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9206 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9207 ? (ReqRetNumElts + 1) / 2
9208 : ReqRetNumElts;
9209
9210 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9211
9212 MVT DataDwordVT =
9213 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9214
9215 MVT MaskPopVT =
9216 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9217
9218 SDValue Data(Result, 0);
9219 SDValue TexFail;
9220
9221 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9222 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9223 if (MaskPopVT.isVector()) {
9224 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9225 SDValue(Result, 0), ZeroIdx);
9226 } else {
9227 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9228 SDValue(Result, 0), ZeroIdx);
9229 }
9230 }
9231
9232 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9233 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9234 NumDataDwords - MaskPopDwords);
9235
9236 if (IsD16)
9237 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9238
9239 EVT LegalReqRetVT = ReqRetVT;
9240 if (!ReqRetVT.isVector()) {
9241 if (!Data.getValueType().isInteger())
9242 Data = DAG.getNode(ISD::BITCAST, DL,
9243 Data.getValueType().changeTypeToInteger(), Data);
9244 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9245 } else {
9246 // We need to widen the return vector to a legal type
9247 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9248 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9249 LegalReqRetVT =
9250 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9251 ReqRetVT.getVectorNumElements() + 1);
9252 }
9253 }
9254 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9255
9256 if (IsTexFail) {
9257 TexFail =
9258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9259 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9260
9261 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9262 }
9263
9264 if (Result->getNumValues() == 1)
9265 return Data;
9266
9267 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9268}
9269
9270static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9271 SDValue *LWE, bool &IsTexFail) {
9272 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9273
9274 uint64_t Value = TexFailCtrlConst->getZExtValue();
9275 if (Value) {
9276 IsTexFail = true;
9277 }
9278
9279 SDLoc DL(TexFailCtrlConst);
9280 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9281 Value &= ~(uint64_t)0x1;
9282 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9283 Value &= ~(uint64_t)0x2;
9284
9285 return Value == 0;
9286}
9287
9288static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9289 MVT PackVectorVT,
9290 SmallVectorImpl<SDValue> &PackedAddrs,
9291 unsigned DimIdx, unsigned EndIdx,
9292 unsigned NumGradients) {
9293 SDLoc DL(Op);
9294 for (unsigned I = DimIdx; I < EndIdx; I++) {
9295 SDValue Addr = Op.getOperand(I);
9296
9297 // Gradients are packed with undef for each coordinate.
9298 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9299 // 1D: undef,dx/dh; undef,dx/dv
9300 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9301 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9302 if (((I + 1) >= EndIdx) ||
9303 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9304 I == DimIdx + NumGradients - 1))) {
9305 if (Addr.getValueType() != MVT::i16)
9306 Addr = DAG.getBitcast(MVT::i16, Addr);
9307 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9308 } else {
9309 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9310 I++;
9311 }
9312 Addr = DAG.getBitcast(MVT::f32, Addr);
9313 PackedAddrs.push_back(Addr);
9314 }
9315}
9316
9317SDValue SITargetLowering::lowerImage(SDValue Op,
9318 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9319 SelectionDAG &DAG, bool WithChain) const {
9320 SDLoc DL(Op);
9321 MachineFunction &MF = DAG.getMachineFunction();
9322 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9323 unsigned IntrOpcode = Intr->BaseOpcode;
9324 // For image atomic: use no-return opcode if result is unused.
9325 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9326 !Op.getNode()->hasAnyUseOfValue(0))
9327 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9329 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
9330 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9331 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9332 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9333 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9334
9335 SmallVector<EVT, 3> ResultTypes(Op->values());
9336 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9337 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9338 ResultTypes.erase(&ResultTypes[0]);
9339
9340 bool IsD16 = false;
9341 bool IsG16 = false;
9342 bool IsA16 = false;
9343 SDValue VData;
9344 int NumVDataDwords = 0;
9345 bool AdjustRetType = false;
9346 bool IsAtomicPacked16Bit = false;
9347
9348 // Offset of intrinsic arguments
9349 const unsigned ArgOffset = WithChain ? 2 : 1;
9350
9351 unsigned DMask;
9352 unsigned DMaskLanes = 0;
9353
9354 if (BaseOpcode->Atomic) {
9355 VData = Op.getOperand(2);
9356
9357 IsAtomicPacked16Bit =
9358 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9359 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9360 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9361 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9362
9363 bool Is64Bit = VData.getValueSizeInBits() == 64;
9364 if (BaseOpcode->AtomicX2) {
9365 SDValue VData2 = Op.getOperand(3);
9366 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9367 {VData, VData2});
9368 if (Is64Bit)
9369 VData = DAG.getBitcast(MVT::v4i32, VData);
9370
9371 if (!BaseOpcode->NoReturn)
9372 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9373
9374 DMask = Is64Bit ? 0xf : 0x3;
9375 NumVDataDwords = Is64Bit ? 4 : 2;
9376 } else {
9377 DMask = Is64Bit ? 0x3 : 0x1;
9378 NumVDataDwords = Is64Bit ? 2 : 1;
9379 }
9380 } else {
9381 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9382 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9383
9384 if (BaseOpcode->Store) {
9385 VData = Op.getOperand(2);
9386
9387 MVT StoreVT = VData.getSimpleValueType();
9388 if (StoreVT.getScalarType() == MVT::f16) {
9389 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9390 return Op; // D16 is unsupported for this instruction
9391
9392 IsD16 = true;
9393 VData = handleD16VData(VData, DAG, true);
9394 }
9395
9396 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9397 } else if (!BaseOpcode->NoReturn) {
9398 // Work out the number of dwords based on the dmask popcount and underlying type
9399 // and whether packing is supported.
9400 MVT LoadVT = ResultTypes[0].getSimpleVT();
9401 if (LoadVT.getScalarType() == MVT::f16) {
9402 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9403 return Op; // D16 is unsupported for this instruction
9404
9405 IsD16 = true;
9406 }
9407
9408 // Confirm that the return type is large enough for the dmask specified
9409 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9410 (!LoadVT.isVector() && DMaskLanes > 1))
9411 return Op;
9412
9413 // The sq block of gfx8 and gfx9 does not estimate register use correctly
9414 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9415 // instructions.
9416 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9417 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9418 NumVDataDwords = (DMaskLanes + 1) / 2;
9419 else
9420 NumVDataDwords = DMaskLanes;
9421
9422 AdjustRetType = true;
9423 }
9424 }
9425
9426 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9427 SmallVector<SDValue, 4> VAddrs;
9428
9429 // Check for 16 bit addresses or derivatives and pack if true.
9430 MVT VAddrVT =
9431 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9432 MVT VAddrScalarVT = VAddrVT.getScalarType();
9433 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9434 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9435
9436 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9437 VAddrScalarVT = VAddrVT.getScalarType();
9438 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9439 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9440
9441 // Push back extra arguments.
9442 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9443 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9444 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9445 // Special handling of bias when A16 is on. Bias is of type half but
9446 // occupies full 32-bit.
9447 SDValue Bias = DAG.getBuildVector(
9448 MVT::v2f16, DL,
9449 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9450 VAddrs.push_back(Bias);
9451 } else {
9452 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9453 "Bias needs to be converted to 16 bit in A16 mode");
9454 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9455 }
9456 }
9457
9458 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9459 // 16 bit gradients are supported, but are tied to the A16 control
9460 // so both gradients and addresses must be 16 bit
9461 LLVM_DEBUG(
9462 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9463 "require 16 bit args for both gradients and addresses");
9464 return Op;
9465 }
9466
9467 if (IsA16) {
9468 if (!ST->hasA16()) {
9469 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9470 "support 16 bit addresses\n");
9471 return Op;
9472 }
9473 }
9474
9475 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9476 // set then we have to compress/pack operands (either address, gradient, or
9477 // both).
9478 // In the case where A16 and gradients are tied (no G16 support), we have
9479 // already verified that both IsA16 and IsG16 are true.
9480 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9481 // Activate g16
9482 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9483 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9484 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9485 }
9486
9487 // Add gradients (packed or unpacked)
9488 if (IsG16) {
9489 // Pack the gradients
9490 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9491 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9492 ArgOffset + Intr->GradientStart,
9493 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9494 } else {
9495 for (unsigned I = ArgOffset + Intr->GradientStart;
9496 I < ArgOffset + Intr->CoordStart; I++)
9497 VAddrs.push_back(Op.getOperand(I));
9498 }
9499
9500 // Add addresses (packed or unpacked)
9501 if (IsA16) {
9502 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9503 ArgOffset + Intr->CoordStart, VAddrEnd,
9504 0 /* No gradients */);
9505 } else {
9506 // Add uncompressed address
9507 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9508 VAddrs.push_back(Op.getOperand(I));
9509 }
9510
9511 // If the register allocator cannot place the address registers contiguously
9512 // without introducing moves, then using the non-sequential address encoding
9513 // is always preferable, since it saves VALU instructions and is usually a
9514 // wash in terms of code size or even better.
9515 //
9516 // However, we currently have no way of hinting to the register allocator that
9517 // MIMG addresses should be placed contiguously when it is possible to do so,
9518 // so force non-NSA for the common 2-address case as a heuristic.
9519 //
9520 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9521 // allocation when possible.
9522 //
9523 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9524 // set of the remaining addresses.
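// Illustrative example (schematic assembly, not from a specific test): with
// NSA a sample using three address registers can be encoded as
//   image_sample v[0:3], [v7, v2, v9], s[0:7], s[8:11] ...
// while the non-NSA form needs one contiguous tuple,
//   image_sample v[0:3], v[2:4], s[0:7], s[8:11] ...
// which may force extra v_mov instructions to assemble that tuple.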
9525 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9526 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9527 const bool UseNSA = ST->hasNSAEncoding() &&
9528 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9529 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9530 const bool UsePartialNSA =
9531 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9532
9533 SDValue VAddr;
9534 if (UsePartialNSA) {
9535 VAddr = getBuildDwordsVector(DAG, DL,
9536 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9537 } else if (!UseNSA) {
9538 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9539 }
9540
9541 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9542 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9543 SDValue Unorm;
9544 if (!BaseOpcode->Sampler) {
9545 Unorm = True;
9546 } else {
9547 uint64_t UnormConst =
9548 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9549
9550 Unorm = UnormConst ? True : False;
9551 }
9552
9553 SDValue TFE;
9554 SDValue LWE;
9555 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9556 bool IsTexFail = false;
9557 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9558 return Op;
9559
9560 if (IsTexFail) {
9561 if (!DMaskLanes) {
9562 // Expecting to get an error flag since TFC is on and dmask is 0.
9563 // Force dmask to be at least 1, otherwise the instruction will fail.
9564 DMask = 0x1;
9565 DMaskLanes = 1;
9566 NumVDataDwords = 1;
9567 }
9568 NumVDataDwords += 1;
9569 AdjustRetType = true;
9570 }
9571
9572 // Something earlier tagged that the return type needs adjusting.
9573 // This happens if the instruction is a load or has set TexFailCtrl flags.
9574 if (AdjustRetType) {
9575 // NumVDataDwords reflects the true number of dwords required in the return
9576 // type
9577 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9578 // This is a no-op load. This can be eliminated
9579 SDValue Undef = DAG.getPOISON(Op.getValueType());
9580 if (isa<MemSDNode>(Op))
9581 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9582 return Undef;
9583 }
9584
9585 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9586 MVT::i32, NumVDataDwords)
9587 : MVT::i32;
9588
9589 ResultTypes[0] = NewVT;
9590 if (ResultTypes.size() == 3) {
9591 // Original result was aggregate type used for TexFailCtrl results
9592 // The actual instruction returns as a vector type which has now been
9593 // created. Remove the aggregate result.
9594 ResultTypes.erase(&ResultTypes[1]);
9595 }
9596 }
9597
9598 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9599 // Keep GLC only when the atomic's result is actually used.
9600 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9601 CPol |= AMDGPU::CPol::GLC;
9602 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9603 AMDGPU::CPol::VOLATILE))
9604 return Op;
9605
9606 SmallVector<SDValue, 26> Ops;
9607 if (BaseOpcode->Store || BaseOpcode->Atomic)
9608 Ops.push_back(VData); // vdata
9609 if (UsePartialNSA) {
9610 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9611 Ops.push_back(VAddr);
9612 } else if (UseNSA)
9613 append_range(Ops, VAddrs);
9614 else
9615 Ops.push_back(VAddr);
9616 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9617 EVT RsrcVT = Rsrc.getValueType();
9618 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9619 return Op;
9620 Ops.push_back(Rsrc);
9621 if (BaseOpcode->Sampler) {
9622 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9623 if (Samp.getValueType() != MVT::v4i32)
9624 return Op;
9625 Ops.push_back(Samp);
9626 }
9627 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9628 if (IsGFX10Plus)
9629 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9630 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9631 Ops.push_back(Unorm);
9632 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9633 Ops.push_back(IsA16 && // r128, a16 for gfx9
9634 ST->hasFeature(AMDGPU::FeatureR128A16)
9635 ? True
9636 : False);
9637 if (IsGFX10Plus)
9638 Ops.push_back(IsA16 ? True : False);
9639
9640 if (!Subtarget->hasGFX90AInsts())
9641 Ops.push_back(TFE); // tfe
9642 else if (TFE->getAsZExtVal()) {
9643 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9644 MF.getFunction(),
9645 "TFE is not supported on this GPU", DL.getDebugLoc()));
9646 }
9647
9648 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9649 Ops.push_back(LWE); // lwe
9650 if (!IsGFX10Plus)
9651 Ops.push_back(DimInfo->DA ? True : False);
9652 if (BaseOpcode->HasD16)
9653 Ops.push_back(IsD16 ? True : False);
9654 if (isa<MemSDNode>(Op))
9655 Ops.push_back(Op.getOperand(0)); // chain
9656
9657 int NumVAddrDwords =
9658 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9659 int Opcode = -1;
9660
9661 if (IsGFX12Plus) {
9662 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9663 NumVDataDwords, NumVAddrDwords);
9664 } else if (IsGFX11Plus) {
9665 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9666 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9667 : AMDGPU::MIMGEncGfx11Default,
9668 NumVDataDwords, NumVAddrDwords);
9669 } else if (IsGFX10Plus) {
9670 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9671 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9672 : AMDGPU::MIMGEncGfx10Default,
9673 NumVDataDwords, NumVAddrDwords);
9674 } else {
9675 if (Subtarget->hasGFX90AInsts()) {
9676 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9677 NumVDataDwords, NumVAddrDwords);
9678 if (Opcode == -1) {
9679 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9680 MF.getFunction(),
9681 "requested image instruction is not supported on this GPU",
9682 DL.getDebugLoc()));
9683
9684 unsigned Idx = 0;
9685 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9686 for (EVT VT : OrigResultTypes) {
9687 if (VT == MVT::Other)
9688 RetValues[Idx++] = Op.getOperand(0); // Chain
9689 else
9690 RetValues[Idx++] = DAG.getPOISON(VT);
9691 }
9692
9693 return DAG.getMergeValues(RetValues, DL);
9694 }
9695 }
9696 if (Opcode == -1 &&
9697 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9698 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9699 NumVDataDwords, NumVAddrDwords);
9700 if (Opcode == -1)
9701 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9702 NumVDataDwords, NumVAddrDwords);
9703 }
9704 if (Opcode == -1)
9705 return Op;
9706
9707 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9708 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9709 MachineMemOperand *MemRef = MemOp->getMemOperand();
9710 DAG.setNodeMemRefs(NewNode, {MemRef});
9711 }
9712
9713 if (BaseOpcode->NoReturn) {
9714 if (BaseOpcode->Atomic)
9715 return DAG.getMergeValues(
9716 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9717
9718 return SDValue(NewNode, 0);
9719 }
9720
9721 if (BaseOpcode->AtomicX2) {
9722 SmallVector<SDValue, 1> Elt;
9723 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9724 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9725 }
9726
9727 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9728 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9729 NumVDataDwords, IsAtomicPacked16Bit, DL);
9730}
9731
9732SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9733 SDValue Offset, SDValue CachePolicy,
9734 SelectionDAG &DAG) const {
9735 MachineFunction &MF = DAG.getMachineFunction();
9736
9737 const DataLayout &DataLayout = DAG.getDataLayout();
9738 Align Alignment =
9739 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9740
9741 MachineMemOperand *MMO = MF.getMachineMemOperand(
9742 MachinePointerInfo(),
9743 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9744 MachineMemOperand::MOInvariant,
9745 VT.getStoreSize(), Alignment);
9746
9747 if (!Offset->isDivergent()) {
9748 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9749
9750 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9751 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9752 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9753 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9754 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9755 SDValue BufferLoad =
9756 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9757 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9758 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9759 }
9760
9761 // Widen vec3 load to vec4.
9762 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9763 !Subtarget->hasScalarDwordx3Loads()) {
9764 EVT WidenedVT =
9765 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9766 auto WidenedOp = DAG.getMemIntrinsicNode(
9767 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9768 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9769 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9770 DAG.getVectorIdxConstant(0, DL));
9771 return Subvector;
9772 }
9773
9774 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9775 DAG.getVTList(VT), Ops, VT, MMO);
9776 }
9777
9778 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9779 // assume that the buffer is unswizzled.
9780 SDValue Ops[] = {
9781 DAG.getEntryNode(), // Chain
9782 Rsrc, // rsrc
9783 DAG.getConstant(0, DL, MVT::i32), // vindex
9784 {}, // voffset
9785 {}, // soffset
9786 {}, // offset
9787 CachePolicy, // cachepolicy
9788 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9789 };
9790 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9791 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9792 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9793 }
9794
9795 SmallVector<SDValue, 4> Loads;
9796 unsigned NumLoads = 1;
9797 MVT LoadVT = VT.getSimpleVT();
9798 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9799 assert((LoadVT.getScalarType() == MVT::i32 ||
9800 LoadVT.getScalarType() == MVT::f32));
9801
9802 if (NumElts == 8 || NumElts == 16) {
9803 NumLoads = NumElts / 4;
9804 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9805 }
9806
9807 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9808
9809 // Use the alignment to ensure that the required offsets will fit into the
9810 // immediate offsets.
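// For example (illustrative): a v8f32 load is emitted as two dwordx4 loads at
// immediate offsets InstOffset and InstOffset + 16, so asking setBufferOffsets
// for Align(32) keeps both offsets representable in the immediate field.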
9811 setBufferOffsets(Offset, DAG, &Ops[3],
9812 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9813
9814 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9815 for (unsigned i = 0; i < NumLoads; ++i) {
9816 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9817 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9818 LoadVT, MMO, DAG));
9819 }
9820
9821 if (NumElts == 8 || NumElts == 16)
9822 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9823
9824 return Loads[0];
9825}
9826
9827SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9828 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9829 if (!Subtarget->hasArchitectedSGPRs())
9830 return {};
9831 SDLoc SL(Op);
9832 MVT VT = MVT::i32;
9833 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9834 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9835 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9836}
9837
9838SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9839 AMDGPU::Hwreg::Id HwReg,
9840 unsigned LowBit,
9841 unsigned Width) const {
9842 SDLoc SL(Op);
9843 using namespace AMDGPU::Hwreg;
9844 return {DAG.getMachineNode(
9845 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9846 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9847 SL, MVT::i32)),
9848 0};
9849}
9850
9851SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9852 unsigned Dim,
9853 const ArgDescriptor &Arg) const {
9854 SDLoc SL(Op);
9855 MachineFunction &MF = DAG.getMachineFunction();
9856 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9857 if (MaxID == 0)
9858 return DAG.getConstant(0, SL, MVT::i32);
9859
9860 // It's undefined behavior if a function marked with the amdgpu-no-*
9861 // attributes uses the corresponding intrinsic.
9862 if (!Arg)
9863 return DAG.getPOISON(Op->getValueType(0));
9864
9865 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9866 SDLoc(DAG.getEntryNode()), Arg);
9867
9868 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9869 // masking operations anyway.
9870 //
9871 // TODO: We could assert the top bit is 0 for the source copy.
9872 if (Arg.isMasked())
9873 return Val;
9874
9875 // Preserve the known bits after expansion to a copy.
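// For example, with MaxID = 1023 the value is asserted to fit in i10, so a
// later mask such as (and x, 0x3ff) or a zero-extension of the workitem ID
// can be folded away.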
9876 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9877 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9878 DAG.getValueType(SmallVT));
9879}
9880
9881SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9882 SelectionDAG &DAG) const {
9883 MachineFunction &MF = DAG.getMachineFunction();
9884 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9885
9886 EVT VT = Op.getValueType();
9887 SDLoc DL(Op);
9888 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9889
9890 // TODO: Should this propagate fast-math-flags?
9891
9892 switch (IntrinsicID) {
9893 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9894 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9895 return emitNonHSAIntrinsicError(DAG, DL, VT);
9896 return getPreloadedValue(DAG, *MFI, VT,
9897 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9898 }
9899 case Intrinsic::amdgcn_dispatch_ptr:
9900 case Intrinsic::amdgcn_queue_ptr: {
9901 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9902 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9903 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9904 DL.getDebugLoc()));
9905 return DAG.getPOISON(VT);
9906 }
9907
9908 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9909 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9910 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9911 return getPreloadedValue(DAG, *MFI, VT, RegID);
9912 }
9913 case Intrinsic::amdgcn_implicitarg_ptr: {
9914 if (MFI->isEntryFunction())
9915 return getImplicitArgPtr(DAG, DL);
9916 return getPreloadedValue(DAG, *MFI, VT,
9917 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9918 }
9919 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9920 if (!AMDGPU::isKernel(MF.getFunction())) {
9921 // This only makes sense to call in a kernel, so just lower to null.
9922 return DAG.getConstant(0, DL, VT);
9923 }
9924
9925 return getPreloadedValue(DAG, *MFI, VT,
9926 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9927 }
9928 case Intrinsic::amdgcn_dispatch_id: {
9929 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9930 }
9931 case Intrinsic::amdgcn_rcp:
9932 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9933 case Intrinsic::amdgcn_rsq:
9934 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9935 case Intrinsic::amdgcn_rsq_legacy:
9936 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9937 return emitRemovedIntrinsicError(DAG, DL, VT);
9938 return SDValue();
9939 case Intrinsic::amdgcn_rcp_legacy:
9940 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9941 return emitRemovedIntrinsicError(DAG, DL, VT);
9942 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9943 case Intrinsic::amdgcn_rsq_clamp: {
9944 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9945 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9946
9947 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9948 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9949 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9950
9951 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9952 SDValue Tmp =
9953 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9954 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9955 DAG.getConstantFP(Min, DL, VT));
9956 }
9957 case Intrinsic::r600_read_ngroups_x:
9958 if (Subtarget->isAmdHsaOS())
9959 return emitNonHSAIntrinsicError(DAG, DL, VT);
9960
9961 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 9962                                    SI::KernelInputOffsets::NGROUPS_X, Align(4),
 9963                                    false);
9964 case Intrinsic::r600_read_ngroups_y:
9965 if (Subtarget->isAmdHsaOS())
9966 return emitNonHSAIntrinsicError(DAG, DL, VT);
9967
9968 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 9969                                    SI::KernelInputOffsets::NGROUPS_Y, Align(4),
 9970                                    false);
9971 case Intrinsic::r600_read_ngroups_z:
9972 if (Subtarget->isAmdHsaOS())
9973 return emitNonHSAIntrinsicError(DAG, DL, VT);
9974
9975 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 9976                                    SI::KernelInputOffsets::NGROUPS_Z, Align(4),
 9977                                    false);
9978 case Intrinsic::r600_read_local_size_x:
9979 if (Subtarget->isAmdHsaOS())
9980 return emitNonHSAIntrinsicError(DAG, DL, VT);
9981
 9982    return lowerImplicitZextParam(DAG, Op, MVT::i16,
 9983                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
9984 case Intrinsic::r600_read_local_size_y:
9985 if (Subtarget->isAmdHsaOS())
9986 return emitNonHSAIntrinsicError(DAG, DL, VT);
9987
 9988    return lowerImplicitZextParam(DAG, Op, MVT::i16,
 9989                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
9990 case Intrinsic::r600_read_local_size_z:
9991 if (Subtarget->isAmdHsaOS())
9992 return emitNonHSAIntrinsicError(DAG, DL, VT);
9993
 9994    return lowerImplicitZextParam(DAG, Op, MVT::i16,
 9995                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
9996 case Intrinsic::amdgcn_workgroup_id_x:
9997 return lowerWorkGroupId(DAG, *MFI, VT,
10001 case Intrinsic::amdgcn_workgroup_id_y:
10002 return lowerWorkGroupId(DAG, *MFI, VT,
10006 case Intrinsic::amdgcn_workgroup_id_z:
10007 return lowerWorkGroupId(DAG, *MFI, VT,
10011 case Intrinsic::amdgcn_cluster_id_x:
10012 return Subtarget->hasClusters()
10013 ? getPreloadedValue(DAG, *MFI, VT,
10015 : DAG.getPOISON(VT);
10016 case Intrinsic::amdgcn_cluster_id_y:
10017 return Subtarget->hasClusters()
10018 ? getPreloadedValue(DAG, *MFI, VT,
10020 : DAG.getPOISON(VT);
10021 case Intrinsic::amdgcn_cluster_id_z:
10022 return Subtarget->hasClusters()
10023 ? getPreloadedValue(DAG, *MFI, VT,
10025 : DAG.getPOISON(VT);
10026 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10027 return Subtarget->hasClusters()
10028 ? getPreloadedValue(
10029 DAG, *MFI, VT,
10031 : DAG.getPOISON(VT);
10032 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10033 return Subtarget->hasClusters()
10034 ? getPreloadedValue(
10035 DAG, *MFI, VT,
10037 : DAG.getPOISON(VT);
10038 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10039 return Subtarget->hasClusters()
10040 ? getPreloadedValue(
10041 DAG, *MFI, VT,
10043 : DAG.getPOISON(VT);
10044 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10045 return Subtarget->hasClusters()
10046 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10047 : SDValue();
10048 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10049 return Subtarget->hasClusters()
10050 ? getPreloadedValue(
10051 DAG, *MFI, VT,
10053 : DAG.getPOISON(VT);
10054 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10055 return Subtarget->hasClusters()
10056 ? getPreloadedValue(
10057 DAG, *MFI, VT,
10059 : DAG.getPOISON(VT);
10060 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10061 return Subtarget->hasClusters()
10062 ? getPreloadedValue(
10063 DAG, *MFI, VT,
10065 : DAG.getPOISON(VT);
10066 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10067 return Subtarget->hasClusters()
10068 ? getPreloadedValue(
10069 DAG, *MFI, VT,
10071 : DAG.getPOISON(VT);
10072 case Intrinsic::amdgcn_wave_id:
10073 return lowerWaveID(DAG, Op);
10074 case Intrinsic::amdgcn_lds_kernel_id: {
10075 if (MFI->isEntryFunction())
10076 return getLDSKernelId(DAG, DL);
10077 return getPreloadedValue(DAG, *MFI, VT,
 10078                             AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
 10079  }
10080 case Intrinsic::amdgcn_workitem_id_x:
10081 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10082 case Intrinsic::amdgcn_workitem_id_y:
10083 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10084 case Intrinsic::amdgcn_workitem_id_z:
10085 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10086 case Intrinsic::amdgcn_wavefrontsize:
10087 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10088 SDLoc(Op), MVT::i32);
10089 case Intrinsic::amdgcn_s_buffer_load: {
10090 unsigned CPol = Op.getConstantOperandVal(3);
10091 // s_buffer_load, because of how it's optimized, can't be volatile
10092 // so reject ones with the volatile bit set.
 10093    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
 10094                     ? AMDGPU::CPol::ALL
 10095                     : AMDGPU::CPol::ALL_pregfx12))
 10096      return Op;
10097 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10098 Op.getOperand(3), DAG);
10099 }
10100 case Intrinsic::amdgcn_fdiv_fast:
10101 return lowerFDIV_FAST(Op, DAG);
10102 case Intrinsic::amdgcn_sin:
10103 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10104
10105 case Intrinsic::amdgcn_cos:
10106 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10107
10108 case Intrinsic::amdgcn_mul_u24:
10109 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10110 Op.getOperand(2));
10111 case Intrinsic::amdgcn_mul_i24:
10112 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10113 Op.getOperand(2));
10114
10115 case Intrinsic::amdgcn_log_clamp: {
10116 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10117 return SDValue();
10118
10119 return emitRemovedIntrinsicError(DAG, DL, VT);
10120 }
10121 case Intrinsic::amdgcn_fract:
10122 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10123
10124 case Intrinsic::amdgcn_class:
10125 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10126 Op.getOperand(2));
10127 case Intrinsic::amdgcn_div_fmas:
10128 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10129 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10130
10131 case Intrinsic::amdgcn_div_fixup:
10132 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10133 Op.getOperand(2), Op.getOperand(3));
10134
10135 case Intrinsic::amdgcn_div_scale: {
10136 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10137
 10138    // Translate to the operands expected by the machine instruction: the
 10139    // first source operand must match either the numerator or the denominator.
10140 SDValue Numerator = Op.getOperand(1);
10141 SDValue Denominator = Op.getOperand(2);
10142
10143 // Note this order is opposite of the machine instruction's operations,
10144 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10145 // intrinsic has the numerator as the first operand to match a normal
10146 // division operation.
10147
10148 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10149
10150 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10151 Denominator, Numerator);
10152 }
10153 case Intrinsic::amdgcn_icmp: {
10154 // There is a Pat that handles this variant, so return it as-is.
10155 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10156 Op.getConstantOperandVal(2) == 0 &&
10157 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10158 return Op;
10159 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10160 }
10161 case Intrinsic::amdgcn_fcmp: {
10162 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10163 }
10164 case Intrinsic::amdgcn_ballot:
10165 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10166 case Intrinsic::amdgcn_fmed3:
10167 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10168 Op.getOperand(2), Op.getOperand(3));
10169 case Intrinsic::amdgcn_fdot2:
10170 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10171 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10172 case Intrinsic::amdgcn_fmul_legacy:
10173 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10174 Op.getOperand(2));
10175 case Intrinsic::amdgcn_sffbh:
10176 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10177 case Intrinsic::amdgcn_sbfe:
10178 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10179 Op.getOperand(2), Op.getOperand(3));
10180 case Intrinsic::amdgcn_ubfe:
10181 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10182 Op.getOperand(2), Op.getOperand(3));
10183 case Intrinsic::amdgcn_cvt_pkrtz:
10184 case Intrinsic::amdgcn_cvt_pknorm_i16:
10185 case Intrinsic::amdgcn_cvt_pknorm_u16:
10186 case Intrinsic::amdgcn_cvt_pk_i16:
10187 case Intrinsic::amdgcn_cvt_pk_u16: {
10188 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10189 EVT VT = Op.getValueType();
10190 unsigned Opcode;
10191
10192 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10193 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10194 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10195 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10196 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10197 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10198 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10199 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10200 else
10201 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10202
10203 if (isTypeLegal(VT))
10204 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10205
10206 SDValue Node =
10207 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10208 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10209 }
10210 case Intrinsic::amdgcn_fmad_ftz:
10211 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10212 Op.getOperand(2), Op.getOperand(3));
10213
10214 case Intrinsic::amdgcn_if_break:
10215 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10216 Op->getOperand(1), Op->getOperand(2)),
10217 0);
10218
10219 case Intrinsic::amdgcn_groupstaticsize: {
 10220    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
 10221    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10222 return Op;
10223
10224 const Module *M = MF.getFunction().getParent();
10225 const GlobalValue *GV =
10226 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10227 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
 10228                                            SIInstrInfo::MO_ABS32_LO);
 10229    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10230 }
10231 case Intrinsic::amdgcn_is_shared:
10232 case Intrinsic::amdgcn_is_private: {
10233 SDLoc SL(Op);
10234 SDValue SrcVec =
10235 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10236 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10237 DAG.getConstant(1, SL, MVT::i32));
10238
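    // Classify the flat pointer by its high 32 bits: either against the
    // globally addressable scratch base or against the per-segment aperture
    // register provided by the hardware.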
10239 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
 10240                      ? AMDGPUAS::LOCAL_ADDRESS
 10241                      : AMDGPUAS::PRIVATE_ADDRESS;
10242 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10243 Subtarget->hasGloballyAddressableScratch()) {
10244 SDValue FlatScratchBaseHi(
10245 DAG.getMachineNode(
10246 AMDGPU::S_MOV_B32, DL, MVT::i32,
10247 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10248 0);
10249 // Test bits 63..58 against the aperture address.
10250 return DAG.getSetCC(
10251 SL, MVT::i1,
10252 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10253 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10254 }
10255
10256 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10257 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10258 }
10259 case Intrinsic::amdgcn_perm:
10260 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10261 Op.getOperand(2), Op.getOperand(3));
10262 case Intrinsic::amdgcn_reloc_constant: {
10263 Module *M = MF.getFunction().getParent();
10264 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10265 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10266 auto *RelocSymbol = cast<GlobalVariable>(
10267 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10268 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
 10269                                            SIInstrInfo::MO_ABS32_LO);
 10270    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10271 }
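  // For the SWMMAC intrinsics below, the index-key operand is legalized to the
  // width the instruction expects (i32 or i64) by any-extending or truncating
  // it and rebuilding the intrinsic node; if it already has the expected type,
  // no rewrite is needed.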
10272 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10273 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10274 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10275 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10276 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10277 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10278 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10279 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10280 if (Op.getOperand(4).getValueType() == MVT::i32)
10281 return SDValue();
10282
10283 SDLoc SL(Op);
10284 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10285 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10286 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10287 Op.getOperand(3), IndexKeyi32);
10288 }
10289 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10290 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10291 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10292 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10293 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10294 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10295 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10296 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10297 if (Op.getOperand(4).getValueType() == MVT::i64)
10298 return SDValue();
10299
10300 SDLoc SL(Op);
10301 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10302 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10303 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10304 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10305 Op.getOperand(6)});
10306 }
10307 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10308 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10309 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10310 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10311 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10312 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10313 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10314 ? MVT::i64
10315 : MVT::i32;
10316 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10317 return SDValue();
10318
10319 SDLoc SL(Op);
 10320    auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
 10321    SmallVector<SDValue, 10> Args{
 10322        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10323 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10324 IndexKey, Op.getOperand(7), Op.getOperand(8)};
10325 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10326 Args.push_back(Op.getOperand(9));
10327 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
10328 }
10329 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10330 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10331 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10332 if (Op.getOperand(6).getValueType() == MVT::i32)
10333 return SDValue();
10334
10335 SDLoc SL(Op);
10336 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10337 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10338 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10339 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10340 IndexKeyi32, Op.getOperand(7)});
10341 }
10342 case Intrinsic::amdgcn_addrspacecast_nonnull:
10343 return lowerADDRSPACECAST(Op, DAG);
10344 case Intrinsic::amdgcn_readlane:
10345 case Intrinsic::amdgcn_readfirstlane:
10346 case Intrinsic::amdgcn_writelane:
10347 case Intrinsic::amdgcn_permlane16:
10348 case Intrinsic::amdgcn_permlanex16:
10349 case Intrinsic::amdgcn_permlane64:
10350 case Intrinsic::amdgcn_set_inactive:
10351 case Intrinsic::amdgcn_set_inactive_chain_arg:
10352 case Intrinsic::amdgcn_mov_dpp8:
10353 case Intrinsic::amdgcn_update_dpp:
10354 return lowerLaneOp(*this, Op.getNode(), DAG);
10355 case Intrinsic::amdgcn_dead: {
 10356    SmallVector<SDValue, 8> Poisons;
 10357    for (const EVT ValTy : Op.getNode()->values())
10358 Poisons.push_back(DAG.getPOISON(ValTy));
10359 return DAG.getMergeValues(Poisons, SDLoc(Op));
10360 }
10361 case Intrinsic::amdgcn_wave_shuffle:
10362 return lowerWaveShuffle(*this, Op.getNode(), DAG);
10363 default:
10364 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
 10365            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
 10366      return lowerImage(Op, ImageDimIntr, DAG, false);
10367
10368 return Op;
10369 }
10370}
10371
10372// On targets not supporting constant in soffset field, turn zero to
10373// SGPR_NULL to avoid generating an extra s_mov with zero.
 10374 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
 10375                              const GCNSubtarget *Subtarget) {
10376 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10377 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10378 return SOffset;
10379}
10380
10381SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10382 SelectionDAG &DAG,
10383 unsigned NewOpcode) const {
10384 SDLoc DL(Op);
10385
10386 SDValue VData = Op.getOperand(2);
10387 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10388 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10389 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10390 SDValue Ops[] = {
10391 Op.getOperand(0), // Chain
10392 VData, // vdata
10393 Rsrc, // rsrc
10394 DAG.getConstant(0, DL, MVT::i32), // vindex
10395 VOffset, // voffset
10396 SOffset, // soffset
10397 Offset, // offset
10398 Op.getOperand(6), // cachepolicy
10399 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10400 };
10401
10402 auto *M = cast<MemSDNode>(Op);
10403
10404 EVT MemVT = VData.getValueType();
10405 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10406 M->getMemOperand());
10407}
10408
10409SDValue
10410SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10411 unsigned NewOpcode) const {
10412 SDLoc DL(Op);
10413
10414 SDValue VData = Op.getOperand(2);
10415 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10416 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10417 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10418 SDValue Ops[] = {
10419 Op.getOperand(0), // Chain
10420 VData, // vdata
10421 Rsrc, // rsrc
10422 Op.getOperand(4), // vindex
10423 VOffset, // voffset
10424 SOffset, // soffset
10425 Offset, // offset
10426 Op.getOperand(7), // cachepolicy
10427 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10428 };
10429
10430 auto *M = cast<MemSDNode>(Op);
10431
10432 EVT MemVT = VData.getValueType();
10433 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10434 M->getMemOperand());
10435}
10436
10437SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10438 SelectionDAG &DAG) const {
10439 unsigned IntrID = Op.getConstantOperandVal(1);
10440 SDLoc DL(Op);
10441
10442 switch (IntrID) {
10443 case Intrinsic::amdgcn_ds_ordered_add:
10444 case Intrinsic::amdgcn_ds_ordered_swap: {
10445 MemSDNode *M = cast<MemSDNode>(Op);
10446 SDValue Chain = M->getOperand(0);
10447 SDValue M0 = M->getOperand(2);
10448 SDValue Value = M->getOperand(3);
10449 unsigned IndexOperand = M->getConstantOperandVal(7);
10450 unsigned WaveRelease = M->getConstantOperandVal(8);
10451 unsigned WaveDone = M->getConstantOperandVal(9);
10452
10453 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10454 IndexOperand &= ~0x3f;
10455 unsigned CountDw = 0;
10456
10457 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10458 CountDw = (IndexOperand >> 24) & 0xf;
10459 IndexOperand &= ~(0xf << 24);
10460
10461 if (CountDw < 1 || CountDw > 4) {
10462 const Function &Fn = DAG.getMachineFunction().getFunction();
10463 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10464 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10465 DL.getDebugLoc()));
10466 CountDw = 1;
10467 }
10468 }
10469
10470 if (IndexOperand) {
10471 const Function &Fn = DAG.getMachineFunction().getFunction();
10472 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10473 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10474 }
10475
10476 if (WaveDone && !WaveRelease) {
10477 // TODO: Move this to IR verifier
10478 const Function &Fn = DAG.getMachineFunction().getFunction();
10479 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10480 Fn, "ds_ordered_count: wave_done requires wave_release",
10481 DL.getDebugLoc()));
10482 }
10483
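    // Pack the immediate offset operand for DS_ORDERED_COUNT: the low byte is
    // the ordered-count index scaled to a dword byte offset, and the high byte
    // encodes wave_release (bit 0), wave_done (bit 1), the shader type
    // (bits 3:2, pre-GFX11), the add/swap selector (bit 4) and, on GFX10+, the
    // dword count minus one (bits 7:6).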
10484 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
 10485    unsigned ShaderType =
 10486        SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
 10487    unsigned Offset0 = OrderedCountIndex << 2;
10488 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10489
10490 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10491 Offset1 |= (CountDw - 1) << 6;
10492
10493 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10494 Offset1 |= ShaderType << 2;
10495
10496 unsigned Offset = Offset0 | (Offset1 << 8);
10497
10498 SDValue Ops[] = {
10499 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10500 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10501 };
10502 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10503 M->getVTList(), Ops, M->getMemoryVT(),
10504 M->getMemOperand());
10505 }
10506 case Intrinsic::amdgcn_raw_buffer_load:
10507 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10508 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10509 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10510 case Intrinsic::amdgcn_raw_buffer_load_format:
10511 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10512 const bool IsFormat =
10513 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10514 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10515
10516 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10517 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10518 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10519 SDValue Ops[] = {
10520 Op.getOperand(0), // Chain
10521 Rsrc, // rsrc
10522 DAG.getConstant(0, DL, MVT::i32), // vindex
10523 VOffset, // voffset
10524 SOffset, // soffset
10525 Offset, // offset
10526 Op.getOperand(5), // cachepolicy, swizzled buffer
10527 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10528 };
10529
10530 auto *M = cast<MemSDNode>(Op);
10531 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10532 }
10533 case Intrinsic::amdgcn_struct_buffer_load:
10534 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10535 case Intrinsic::amdgcn_struct_buffer_load_format:
10536 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10537 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10538 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10539 const bool IsFormat =
10540 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10541 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10542
10543 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10544 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10545 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10546 SDValue Ops[] = {
10547 Op.getOperand(0), // Chain
10548 Rsrc, // rsrc
10549 Op.getOperand(3), // vindex
10550 VOffset, // voffset
10551 SOffset, // soffset
10552 Offset, // offset
10553 Op.getOperand(6), // cachepolicy, swizzled buffer
10554 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10555 };
10556
10557 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10558 }
10559 case Intrinsic::amdgcn_raw_tbuffer_load:
10560 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10561 MemSDNode *M = cast<MemSDNode>(Op);
10562 EVT LoadVT = Op.getValueType();
10563 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10564 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10565 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10566
10567 SDValue Ops[] = {
10568 Op.getOperand(0), // Chain
10569 Rsrc, // rsrc
10570 DAG.getConstant(0, DL, MVT::i32), // vindex
10571 VOffset, // voffset
10572 SOffset, // soffset
10573 Offset, // offset
10574 Op.getOperand(5), // format
10575 Op.getOperand(6), // cachepolicy, swizzled buffer
10576 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10577 };
10578
10579 if (LoadVT.getScalarType() == MVT::f16)
10580 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10581 Ops);
10582 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10583 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10584 DAG);
10585 }
10586 case Intrinsic::amdgcn_struct_tbuffer_load:
10587 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10588 MemSDNode *M = cast<MemSDNode>(Op);
10589 EVT LoadVT = Op.getValueType();
10590 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10591 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10592 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10593
10594 SDValue Ops[] = {
10595 Op.getOperand(0), // Chain
10596 Rsrc, // rsrc
10597 Op.getOperand(3), // vindex
10598 VOffset, // voffset
10599 SOffset, // soffset
10600 Offset, // offset
10601 Op.getOperand(6), // format
10602 Op.getOperand(7), // cachepolicy, swizzled buffer
10603 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10604 };
10605
10606 if (LoadVT.getScalarType() == MVT::f16)
10607 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10608 Ops);
10609 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10610 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10611 DAG);
10612 }
10613 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10614 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10615 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10616 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10617 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10618 return lowerStructBufferAtomicIntrin(Op, DAG,
10619 AMDGPUISD::BUFFER_ATOMIC_FADD);
10620 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10621 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10622 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10623 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10624 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10625 return lowerStructBufferAtomicIntrin(Op, DAG,
10626 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10627 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10629 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10630 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10631 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10632 return lowerStructBufferAtomicIntrin(Op, DAG,
10633 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10634 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10635 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10636 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10637 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10638 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10639 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10640 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10641 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10642 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10643 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10644 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10645 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10646 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10648 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10649 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10650 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10651 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10652 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10653 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10654 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10655 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10656 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10657 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10658 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10659 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10660 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10661 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10662 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10663 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10664 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10665 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10666 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10667 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10668 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10669 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10670 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10671 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10672 return lowerStructBufferAtomicIntrin(Op, DAG,
10673 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10674 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10675 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10676 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10677 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10678 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10679 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10680 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10681 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10682 return lowerStructBufferAtomicIntrin(Op, DAG,
10683 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10684 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10685 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10686 return lowerStructBufferAtomicIntrin(Op, DAG,
10687 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10688 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10689 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10690 return lowerStructBufferAtomicIntrin(Op, DAG,
10691 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10692 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10693 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10694 return lowerStructBufferAtomicIntrin(Op, DAG,
10695 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10696 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10697 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10698 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10699 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10700 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10701 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10702 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10703 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10704 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10705 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10706 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10707 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10708 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10709 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10710 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10711 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10712 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10713 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10714 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10715 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10716 return lowerStructBufferAtomicIntrin(Op, DAG,
10717 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10718 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10719 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10720 return lowerRawBufferAtomicIntrin(Op, DAG,
10721 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10722 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10723 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10724 return lowerStructBufferAtomicIntrin(Op, DAG,
10725 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10726 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10727 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10728 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10729 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10730 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10731 SDValue Ops[] = {
10732 Op.getOperand(0), // Chain
10733 Op.getOperand(2), // src
10734 Op.getOperand(3), // cmp
10735 Rsrc, // rsrc
10736 DAG.getConstant(0, DL, MVT::i32), // vindex
10737 VOffset, // voffset
10738 SOffset, // soffset
10739 Offset, // offset
10740 Op.getOperand(7), // cachepolicy
10741 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10742 };
10743 EVT VT = Op.getValueType();
10744 auto *M = cast<MemSDNode>(Op);
10745
10746 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10747 Op->getVTList(), Ops, VT,
10748 M->getMemOperand());
10749 }
10750 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10751 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10752 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10753 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10754 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10755 SDValue Ops[] = {
10756 Op.getOperand(0), // Chain
10757 Op.getOperand(2), // src
10758 Op.getOperand(3), // cmp
10759 Rsrc, // rsrc
10760 Op.getOperand(5), // vindex
10761 VOffset, // voffset
10762 SOffset, // soffset
10763 Offset, // offset
10764 Op.getOperand(8), // cachepolicy
10765 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10766 };
10767 EVT VT = Op.getValueType();
10768 auto *M = cast<MemSDNode>(Op);
10769
10770 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10771 Op->getVTList(), Ops, VT,
10772 M->getMemOperand());
10773 }
10774 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10775 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10776 MemSDNode *M = cast<MemSDNode>(Op);
10777 SDValue NodePtr = M->getOperand(2);
10778 SDValue RayExtent = M->getOperand(3);
10779 SDValue InstanceMask = M->getOperand(4);
10780 SDValue RayOrigin = M->getOperand(5);
10781 SDValue RayDir = M->getOperand(6);
10782 SDValue Offsets = M->getOperand(7);
10783 SDValue TDescr = M->getOperand(8);
10784
10785 assert(NodePtr.getValueType() == MVT::i64);
10786 assert(RayDir.getValueType() == MVT::v3f32);
10787
10788 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10789 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10790 return SDValue();
10791 }
10792
10793 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10794 const unsigned NumVDataDwords = 10;
10795 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10796 int Opcode = AMDGPU::getMIMGOpcode(
10797 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10798 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10799 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10800 assert(Opcode != -1);
 10801
 10802    SmallVector<SDValue, 16> Ops;
10803 Ops.push_back(NodePtr);
10804 Ops.push_back(DAG.getBuildVector(
10805 MVT::v2i32, DL,
10806 {DAG.getBitcast(MVT::i32, RayExtent),
10807 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10808 Ops.push_back(RayOrigin);
10809 Ops.push_back(RayDir);
10810 Ops.push_back(Offsets);
10811 Ops.push_back(TDescr);
10812 Ops.push_back(M->getChain());
10813
10814 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10815 MachineMemOperand *MemRef = M->getMemOperand();
10816 DAG.setNodeMemRefs(NewNode, {MemRef});
10817 return SDValue(NewNode, 0);
10818 }
10819 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10820 MemSDNode *M = cast<MemSDNode>(Op);
10821 SDValue NodePtr = M->getOperand(2);
10822 SDValue RayExtent = M->getOperand(3);
10823 SDValue RayOrigin = M->getOperand(4);
10824 SDValue RayDir = M->getOperand(5);
10825 SDValue RayInvDir = M->getOperand(6);
10826 SDValue TDescr = M->getOperand(7);
10827
10828 assert(NodePtr.getValueType() == MVT::i32 ||
10829 NodePtr.getValueType() == MVT::i64);
10830 assert(RayDir.getValueType() == MVT::v3f16 ||
10831 RayDir.getValueType() == MVT::v3f32);
10832
10833 if (!Subtarget->hasGFX10_AEncoding()) {
10834 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10835 return SDValue();
10836 }
10837
10838 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10839 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10840 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10841 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10842 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10843 const unsigned NumVDataDwords = 4;
10844 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10845 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10846 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10847 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10848 IsGFX12Plus;
10849 const unsigned BaseOpcodes[2][2] = {
10850 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10851 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10852 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10853 int Opcode;
10854 if (UseNSA) {
10855 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10856 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10857 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10858 : AMDGPU::MIMGEncGfx10NSA,
10859 NumVDataDwords, NumVAddrDwords);
10860 } else {
10861 assert(!IsGFX12Plus);
10862 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10863 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10864 : AMDGPU::MIMGEncGfx10Default,
10865 NumVDataDwords, NumVAddrDwords);
10866 }
10867 assert(Opcode != -1);
10868
 10869    SmallVector<SDValue, 16> Ops;
 10870
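    // packLanes pushes a 3-element vector operand onto the VADDR list: 32-bit
    // lanes go in as individual i32 values, while f16 lanes are packed two per
    // i32, re-pairing the leftover half with the next operand's first lane when
    // this operand does not start on an even half-dword boundary.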
10871 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10873 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10874 if (Lanes[0].getValueSizeInBits() == 32) {
10875 for (unsigned I = 0; I < 3; ++I)
10876 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10877 } else {
10878 if (IsAligned) {
10879 Ops.push_back(DAG.getBitcast(
10880 MVT::i32,
10881 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10882 Ops.push_back(Lanes[2]);
10883 } else {
10884 SDValue Elt0 = Ops.pop_back_val();
10885 Ops.push_back(DAG.getBitcast(
10886 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10887 Ops.push_back(DAG.getBitcast(
10888 MVT::i32,
10889 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10890 }
10891 }
10892 };
10893
10894 if (UseNSA && IsGFX11Plus) {
10895 Ops.push_back(NodePtr);
10896 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10897 Ops.push_back(RayOrigin);
10898 if (IsA16) {
10899 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10900 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10901 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10902 for (unsigned I = 0; I < 3; ++I) {
10903 MergedLanes.push_back(DAG.getBitcast(
10904 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10905 {DirLanes[I], InvDirLanes[I]})));
10906 }
10907 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10908 } else {
10909 Ops.push_back(RayDir);
10910 Ops.push_back(RayInvDir);
10911 }
10912 } else {
10913 if (Is64)
10914 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10915 2);
10916 else
10917 Ops.push_back(NodePtr);
10918
10919 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10920 packLanes(RayOrigin, true);
10921 packLanes(RayDir, true);
10922 packLanes(RayInvDir, false);
10923 }
10924
10925 if (!UseNSA) {
10926 // Build a single vector containing all the operands so far prepared.
10927 if (NumVAddrDwords > 12) {
10928 SDValue Undef = DAG.getPOISON(MVT::i32);
10929 Ops.append(16 - Ops.size(), Undef);
10930 }
10931 assert(Ops.size() >= 8 && Ops.size() <= 12);
10932 SDValue MergedOps =
10933 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10934 Ops.clear();
10935 Ops.push_back(MergedOps);
10936 }
10937
10938 Ops.push_back(TDescr);
10939 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10940 Ops.push_back(M->getChain());
10941
10942 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10943 MachineMemOperand *MemRef = M->getMemOperand();
10944 DAG.setNodeMemRefs(NewNode, {MemRef});
10945 return SDValue(NewNode, 0);
10946 }
10947 case Intrinsic::amdgcn_global_atomic_fmin_num:
10948 case Intrinsic::amdgcn_global_atomic_fmax_num:
10949 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10950 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10951 MemSDNode *M = cast<MemSDNode>(Op);
10952 SDValue Ops[] = {
10953 M->getOperand(0), // Chain
10954 M->getOperand(2), // Ptr
10955 M->getOperand(3) // Value
10956 };
10957 unsigned Opcode = 0;
10958 switch (IntrID) {
10959 case Intrinsic::amdgcn_global_atomic_fmin_num:
10960 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10961 Opcode = ISD::ATOMIC_LOAD_FMIN;
10962 break;
10963 }
10964 case Intrinsic::amdgcn_global_atomic_fmax_num:
10965 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10966 Opcode = ISD::ATOMIC_LOAD_FMAX;
10967 break;
10968 }
10969 default:
10970 llvm_unreachable("unhandled atomic opcode");
10971 }
10972 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10973 Ops, M->getMemOperand());
10974 }
10975 case Intrinsic::amdgcn_s_get_barrier_state:
10976 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10977 SDValue Chain = Op->getOperand(0);
 10978    SmallVector<SDValue, 2> Ops;
 10979    unsigned Opc;
10980
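    // For named barriers the barrier ID is encoded in bits 9:4 of the operand;
    // extract it either as an immediate or into M0 for the register form.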
10981 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10982 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10983 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10984 BarID = (BarID >> 4) & 0x3F;
10985 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10986 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10987 Ops.push_back(K);
10988 Ops.push_back(Chain);
10989 } else {
10990 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10991 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10992 SDValue M0Val;
10993 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10994 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10995 M0Val = SDValue(
10996 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10997 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10998 0);
10999 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11000 } else
11001 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11002 }
11003
11004 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11005 return SDValue(NewMI, 0);
11006 }
11007 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11008 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11009 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11010 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11011 SDValue Chain = Op->getOperand(0);
11012 SDValue Ptr = Op->getOperand(2);
11013 EVT VT = Op->getValueType(0);
11014 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11015 Chain, Ptr, MII->getMemOperand());
11016 }
11017 default:
11018
11019 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
 11020            AMDGPU::getImageDimIntrinsicInfo(IntrID))
 11021      return lowerImage(Op, ImageDimIntr, DAG, true);
11022
11023 return SDValue();
11024 }
11025}
11026
11027// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11028// dwordx4 if on SI and handle TFE loads.
11029SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11030 SDVTList VTList,
11031 ArrayRef<SDValue> Ops, EVT MemVT,
11032 MachineMemOperand *MMO,
11033 SelectionDAG &DAG) const {
11034 LLVMContext &C = *DAG.getContext();
11035 MachineFunction &MF = DAG.getMachineFunction();
11036 EVT VT = VTList.VTs[0];
11037
11038 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11039 bool IsTFE = VTList.NumVTs == 3;
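  // TFE loads return the data and the status dword in one contiguous set of
  // registers, so perform the load as a wider dword vector and then split the
  // result back into the value and the status word.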
11040 if (IsTFE) {
11041 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11042 unsigned NumOpDWords = NumValueDWords + 1;
11043 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11044 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11045 MachineMemOperand *OpDWordsMMO =
11046 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11047 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11048 OpDWordsVT, OpDWordsMMO, DAG);
11049 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11050 DAG.getVectorIdxConstant(NumValueDWords, DL));
11051 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11052 SDValue ValueDWords =
11053 NumValueDWords == 1
11054 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
 11055            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
 11056                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11057 ZeroIdx);
11058 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11059 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11060 }
11061
11062 if (!Subtarget->hasDwordx3LoadStores() &&
11063 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11064 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11065 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11066 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11067 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11068 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11069 WidenedMemVT, WidenedMMO);
 11070    SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
 11071                                DAG.getVectorIdxConstant(0, DL));
11072 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11073 }
11074
11075 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11076}
11077
11078SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11079 bool ImageStore) const {
11080 EVT StoreVT = VData.getValueType();
11081
11082 // No change for f16 and legal vector D16 types.
11083 if (!StoreVT.isVector())
11084 return VData;
11085
11086 SDLoc DL(VData);
11087 unsigned NumElements = StoreVT.getVectorNumElements();
11088
11089 if (Subtarget->hasUnpackedD16VMem()) {
11090 // We need to unpack the packed data to store.
11091 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11092 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11093
11094 EVT EquivStoreVT =
11095 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11096 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11097 return DAG.UnrollVectorOp(ZExt.getNode());
11098 }
11099
11100 // The sq block of gfx8.1 does not estimate register use correctly for d16
11101 // image store instructions. The data operand is computed as if it were not a
11102 // d16 image instruction.
11103 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11104 // Bitcast to i16
11105 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11106 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11107
11108 // Decompose into scalars
 11109    SmallVector<SDValue, 4> Elts;
 11110    DAG.ExtractVectorElements(IntVData, Elts);
11111
11112 // Group pairs of i16 into v2i16 and bitcast to i32
11113 SmallVector<SDValue, 4> PackedElts;
11114 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11115 SDValue Pair =
11116 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11117 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11118 PackedElts.push_back(IntPair);
11119 }
11120 if ((NumElements % 2) == 1) {
11121 // Handle v3i16
11122 unsigned I = Elts.size() / 2;
11123 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11124 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11125 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11126 PackedElts.push_back(IntPair);
11127 }
11128
 11129    // Pad using poison
11130 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11131
11132 // Build final vector
11133 EVT VecVT =
11134 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11135 return DAG.getBuildVector(VecVT, DL, PackedElts);
11136 }
11137
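  // Widen odd-sized d16 vectors to the next even element count by bitcasting
  // to an integer, zero-extending, and bitcasting back.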
11138 if (NumElements == 3) {
11139 EVT IntStoreVT =
 11140        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
 11141    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11142
11143 EVT WidenedStoreVT = EVT::getVectorVT(
11144 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11145 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11146 WidenedStoreVT.getStoreSizeInBits());
11147 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11148 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11149 }
11150
11151 assert(isTypeLegal(StoreVT));
11152 return VData;
11153}
11154
11155SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11156 SelectionDAG &DAG) const {
11157 SDLoc DL(Op);
11158 SDValue Chain = Op.getOperand(0);
11159 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11160 MachineFunction &MF = DAG.getMachineFunction();
11161
11162 switch (IntrinsicID) {
11163 case Intrinsic::amdgcn_exp_compr: {
11164 if (!Subtarget->hasCompressedExport()) {
11165 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
 11166          DAG.getMachineFunction().getFunction(),
 11167          "intrinsic not supported on subtarget", DL.getDebugLoc()));
11168 }
11169 SDValue Src0 = Op.getOperand(4);
11170 SDValue Src1 = Op.getOperand(5);
11171 // Hack around illegal type on SI by directly selecting it.
11172 if (isTypeLegal(Src0.getValueType()))
11173 return SDValue();
11174
11175 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11176 SDValue Undef = DAG.getPOISON(MVT::f32);
11177 const SDValue Ops[] = {
11178 Op.getOperand(2), // tgt
11179 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11180 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11181 Undef, // src2
11182 Undef, // src3
11183 Op.getOperand(7), // vm
11184 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11185 Op.getOperand(3), // en
11186 Op.getOperand(0) // Chain
11187 };
11188
11189 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11190 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11191 }
11192
11193 case Intrinsic::amdgcn_struct_tbuffer_store:
11194 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11195 SDValue VData = Op.getOperand(2);
11196 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11197 if (IsD16)
11198 VData = handleD16VData(VData, DAG);
11199 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11200 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11201 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11202 SDValue Ops[] = {
11203 Chain,
11204 VData, // vdata
11205 Rsrc, // rsrc
11206 Op.getOperand(4), // vindex
11207 VOffset, // voffset
11208 SOffset, // soffset
11209 Offset, // offset
11210 Op.getOperand(7), // format
11211 Op.getOperand(8), // cachepolicy, swizzled buffer
11212 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11213 };
11214 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11215 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11216 MemSDNode *M = cast<MemSDNode>(Op);
11217 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11218 M->getMemoryVT(), M->getMemOperand());
11219 }
11220
11221 case Intrinsic::amdgcn_raw_tbuffer_store:
11222 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11223 SDValue VData = Op.getOperand(2);
11224 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11225 if (IsD16)
11226 VData = handleD16VData(VData, DAG);
11227 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11228 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11229 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11230 SDValue Ops[] = {
11231 Chain,
11232 VData, // vdata
11233 Rsrc, // rsrc
11234 DAG.getConstant(0, DL, MVT::i32), // vindex
11235 VOffset, // voffset
11236 SOffset, // soffset
11237 Offset, // offset
11238 Op.getOperand(6), // format
11239 Op.getOperand(7), // cachepolicy, swizzled buffer
11240 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11241 };
11242 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11243 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11244 MemSDNode *M = cast<MemSDNode>(Op);
11245 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11246 M->getMemoryVT(), M->getMemOperand());
11247 }
11248
11249 case Intrinsic::amdgcn_raw_buffer_store:
11250 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11251 case Intrinsic::amdgcn_raw_buffer_store_format:
11252 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11253 const bool IsFormat =
11254 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11255 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11256
11257 SDValue VData = Op.getOperand(2);
11258 EVT VDataVT = VData.getValueType();
11259 EVT EltType = VDataVT.getScalarType();
11260 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11261 if (IsD16) {
11262 VData = handleD16VData(VData, DAG);
11263 VDataVT = VData.getValueType();
11264 }
11265
11266 if (!isTypeLegal(VDataVT)) {
11267 VData =
11268 DAG.getNode(ISD::BITCAST, DL,
11269 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11270 }
11271
11272 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11273 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11274 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11275 SDValue Ops[] = {
11276 Chain,
11277 VData,
11278 Rsrc,
11279 DAG.getConstant(0, DL, MVT::i32), // vindex
11280 VOffset, // voffset
11281 SOffset, // soffset
11282 Offset, // offset
11283 Op.getOperand(6), // cachepolicy, swizzled buffer
11284 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11285 };
11286 unsigned Opc =
11287 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11288 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11289 MemSDNode *M = cast<MemSDNode>(Op);
11290
11291 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11292 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11293 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11294
11295 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11296 M->getMemoryVT(), M->getMemOperand());
11297 }
11298
11299 case Intrinsic::amdgcn_struct_buffer_store:
11300 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11301 case Intrinsic::amdgcn_struct_buffer_store_format:
11302 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11303 const bool IsFormat =
11304 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11305 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11306
11307 SDValue VData = Op.getOperand(2);
11308 EVT VDataVT = VData.getValueType();
11309 EVT EltType = VDataVT.getScalarType();
11310 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11311
11312 if (IsD16) {
11313 VData = handleD16VData(VData, DAG);
11314 VDataVT = VData.getValueType();
11315 }
11316
11317 if (!isTypeLegal(VDataVT)) {
11318 VData =
11319 DAG.getNode(ISD::BITCAST, DL,
11320 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11321 }
11322
11323 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11324 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11325 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11326 SDValue Ops[] = {
11327 Chain,
11328 VData,
11329 Rsrc,
11330 Op.getOperand(4), // vindex
11331 VOffset, // voffset
11332 SOffset, // soffset
11333 Offset, // offset
11334 Op.getOperand(7), // cachepolicy, swizzled buffer
11335 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11336 };
11337 unsigned Opc =
11338 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11339 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11340 MemSDNode *M = cast<MemSDNode>(Op);
11341
11342 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11343 EVT VDataType = VData.getValueType().getScalarType();
11344 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11345 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11346
11347 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11348 M->getMemoryVT(), M->getMemOperand());
11349 }
11350 case Intrinsic::amdgcn_raw_buffer_load_lds:
11351 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11352 case Intrinsic::amdgcn_struct_buffer_load_lds:
11353 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11354 if (!Subtarget->hasVMemToLDSLoad())
11355 return SDValue();
11356 unsigned Opc;
11357 bool HasVIndex =
11358 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11359 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11360 unsigned OpOffset = HasVIndex ? 1 : 0;
11361 SDValue VOffset = Op.getOperand(5 + OpOffset);
11362 bool HasVOffset = !isNullConstant(VOffset);
11363 unsigned Size = Op->getConstantOperandVal(4);
11364
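    // Pick the MUBUF direct-to-LDS opcode from the load size and from whether
    // a VGPR index and/or offset operand is present.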
11365 switch (Size) {
11366 default:
11367 return SDValue();
11368 case 1:
11369 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11370 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11371 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11372 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11373 break;
11374 case 2:
11375 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11376 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11377 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11378 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11379 break;
11380 case 4:
11381 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11382 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11383 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11384 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11385 break;
11386 case 12:
11387 if (!Subtarget->hasLDSLoadB96_B128())
11388 return SDValue();
11389 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11390 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11391 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11392 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11393 break;
11394 case 16:
11395 if (!Subtarget->hasLDSLoadB96_B128())
11396 return SDValue();
11397 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11398 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11399 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11400 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11401 break;
11402 }
11403
11404 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11405
11406 SmallVector<SDValue, 8> Ops;
11407
11408 if (HasVIndex && HasVOffset)
11409 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11410 {Op.getOperand(5), // VIndex
11411 VOffset}));
11412 else if (HasVIndex)
11413 Ops.push_back(Op.getOperand(5));
11414 else if (HasVOffset)
11415 Ops.push_back(VOffset);
11416
11417 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11418 Ops.push_back(Rsrc);
11419 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11420 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11421 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11422 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11423 Ops.push_back(DAG.getTargetConstant(
11424 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11425 DL, MVT::i8)); // cpol
11426 Ops.push_back(DAG.getTargetConstant(
11427 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11428 ? 1
11429 : 0,
11430 DL, MVT::i8)); // swz
11431 Ops.push_back(M0Val.getValue(0)); // Chain
11432 Ops.push_back(M0Val.getValue(1)); // Glue
11433
11434 auto *M = cast<MemSDNode>(Op);
11435 MachineMemOperand *LoadMMO = M->getMemOperand();
11436 // Don't set the offset value here because the pointer points to the base of
11437 // the buffer.
11438 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11439
11440 MachinePointerInfo StorePtrI = LoadPtrI;
11441 LoadPtrI.V = PoisonValue::get(
11442 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11443 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11444 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11445
11446 auto F = LoadMMO->getFlags() &
11447 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11448 LoadMMO =
11449 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11450 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11451
11452 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11453 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11454 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11455
11456 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11457 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11458
11459 return SDValue(Load, 0);
11460 }
11461 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11462 // for "trust me" that the remaining cases are global pointers until
11463 // such time as we can put two mem operands on an intrinsic.
11464 case Intrinsic::amdgcn_load_to_lds:
11465 case Intrinsic::amdgcn_global_load_lds: {
11466 if (!Subtarget->hasVMemToLDSLoad())
11467 return SDValue();
11468
11469 unsigned Opc;
11470 unsigned Size = Op->getConstantOperandVal(4);
11471 switch (Size) {
11472 default:
11473 return SDValue();
11474 case 1:
11475 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11476 break;
11477 case 2:
11478 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11479 break;
11480 case 4:
11481 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11482 break;
11483 case 12:
11484 if (!Subtarget->hasLDSLoadB96_B128())
11485 return SDValue();
11486 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11487 break;
11488 case 16:
11489 if (!Subtarget->hasLDSLoadB96_B128())
11490 return SDValue();
11491 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11492 break;
11493 }
11494
11495 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11496
11497 SmallVector<SDValue, 6> Ops;
11498
11499 SDValue Addr = Op.getOperand(2); // Global ptr
11500 SDValue VOffset;
11501 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11502 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11503 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11504 SDValue LHS = Addr.getOperand(0);
11505 SDValue RHS = Addr.getOperand(1);
11506
11507 if (LHS->isDivergent())
11508 std::swap(LHS, RHS);
11509
11510 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11511 RHS.getOperand(0).getValueType() == MVT::i32) {
11512 // add (i64 sgpr), (zero_extend (i32 vgpr))
11513 Addr = LHS;
11514 VOffset = RHS.getOperand(0);
11515 }
11516 }
11517
11518 Ops.push_back(Addr);
11519 if (!Addr->isDivergent()) {
11520 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11521 if (!VOffset)
11522 VOffset =
11523 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11524 DAG.getTargetConstant(0, DL, MVT::i32)),
11525 0);
11526 Ops.push_back(VOffset);
11527 }
11528
11529 Ops.push_back(Op.getOperand(5)); // Offset
11530
11531 unsigned Aux = Op.getConstantOperandVal(6);
11532 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11533 MVT::i32)); // CPol
11534
11535 Ops.push_back(M0Val.getValue(0)); // Chain
11536 Ops.push_back(M0Val.getValue(1)); // Glue
11537
11538 auto *M = cast<MemSDNode>(Op);
11539 MachineMemOperand *LoadMMO = M->getMemOperand();
11540 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11541 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11542 MachinePointerInfo StorePtrI = LoadPtrI;
11543 LoadPtrI.V = PoisonValue::get(
11544 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11545 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11546 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11547 auto F = LoadMMO->getFlags() &
11548 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11549 LoadMMO =
11550 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11551 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11552 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11553 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11554 LoadMMO->getAAInfo());
11555
11556 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11557 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11558
11559 return SDValue(Load, 0);
11560 }
11561 case Intrinsic::amdgcn_end_cf:
11562 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11563 Op->getOperand(2), Chain),
11564 0);
11565 case Intrinsic::amdgcn_s_barrier_init:
11566 case Intrinsic::amdgcn_s_barrier_signal_var: {
11567 // These two intrinsics take two operands: the barrier pointer and the member count.
11568 SDValue Chain = Op->getOperand(0);
11569 SmallVector<SDValue, 2> Ops;
11570 SDValue BarOp = Op->getOperand(2);
11571 SDValue CntOp = Op->getOperand(3);
11572 SDValue M0Val;
11573 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11574 ? AMDGPU::S_BARRIER_INIT_M0
11575 : AMDGPU::S_BARRIER_SIGNAL_M0;
11576 // extract the BarrierID from bits 4-9 of BarOp
11577 SDValue BarID;
11578 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11579 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11580 BarID =
11581 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11582 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11583 0);
11584 // Member count should be put into M0[ShAmt:+6]
11585 // Barrier ID should be put into M0[5:0]
11586 M0Val =
11587 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11588 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11589 0);
11590 constexpr unsigned ShAmt = 16;
11591 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11592 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11593
11594 M0Val = SDValue(
11595 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11596
11597 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11598
11599 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11600 return SDValue(NewMI, 0);
11601 }
11602 case Intrinsic::amdgcn_s_wakeup_barrier: {
11603 if (!Subtarget->hasSWakeupBarrier())
11604 return SDValue();
11605 [[fallthrough]];
11606 }
11607 case Intrinsic::amdgcn_s_barrier_join: {
11608 // These intrinsics take a single operand: the barrier pointer.
11609 SDValue Chain = Op->getOperand(0);
11610 SmallVector<SDValue, 2> Ops;
11611 SDValue BarOp = Op->getOperand(2);
11612 unsigned Opc;
11613
11614 if (isa<ConstantSDNode>(BarOp)) {
11615 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11616 switch (IntrinsicID) {
11617 default:
11618 return SDValue();
11619 case Intrinsic::amdgcn_s_barrier_join:
11620 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11621 break;
11622 case Intrinsic::amdgcn_s_wakeup_barrier:
11623 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11624 break;
11625 }
11626 // extract the BarrierID from bits 4-9 of the immediate
11627 unsigned BarID = (BarVal >> 4) & 0x3F;
11628 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11629 Ops.push_back(K);
11630 Ops.push_back(Chain);
11631 } else {
11632 switch (IntrinsicID) {
11633 default:
11634 return SDValue();
11635 case Intrinsic::amdgcn_s_barrier_join:
11636 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11637 break;
11638 case Intrinsic::amdgcn_s_wakeup_barrier:
11639 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11640 break;
11641 }
11642 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11643 SDValue M0Val;
11644 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11645 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11646 M0Val =
11647 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11648 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11649 0);
11650 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11651 }
11652
11653 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11654 return SDValue(NewMI, 0);
11655 }
11656 case Intrinsic::amdgcn_s_prefetch_data: {
11657 // For non-global address space preserve the chain and remove the call.
11658 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11659 return Op.getOperand(0);
11660 return Op;
11661 }
11662 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11663 SDValue Ops[] = {
11664 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11665 Op.getOperand(3), // offset
11666 Op.getOperand(4), // length
11667 };
11668
11669 MemSDNode *M = cast<MemSDNode>(Op);
11670 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11671 Op->getVTList(), Ops, M->getMemoryVT(),
11672 M->getMemOperand());
11673 }
11674 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11675 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11676 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11677 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11678 SDValue Chain = Op->getOperand(0);
11679 SDValue Ptr = Op->getOperand(2);
11680 SDValue Val = Op->getOperand(3);
11681 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11682 Ptr, MII->getMemOperand());
11683 }
11684 default: {
11685 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11686 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11687 return lowerImage(Op, ImageDimIntr, DAG, true);
11688
11689 return Op;
11690 }
11691 }
11692}
11693
11694// Return whether the operation has NoUnsignedWrap property.
11695static bool isNoUnsignedWrap(SDValue Addr) {
11696 return (Addr.getOpcode() == ISD::ADD &&
11697 Addr->getFlags().hasNoUnsignedWrap()) ||
11698 Addr->getOpcode() == ISD::OR;
11699}
11700
11702 EVT PtrVT) const {
11703 return PtrVT == MVT::i64;
11704}
11705
11707 EVT PtrVT) const {
11708 return true;
11709}
11710
11711// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11712// offset (the offset that is included in bounds checking and swizzling, to be
11713// split between the instruction's voffset and immoffset fields) and soffset
11714// (the offset that is excluded from bounds checking and swizzling, to go in
11715// the instruction's soffset field). This function takes the first kind of
11716// offset and figures out how to split it between voffset and immoffset.
11717std::pair<SDValue, SDValue>
11718SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11719 SDLoc DL(Offset);
11720 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11721 SDValue N0 = Offset;
11722 ConstantSDNode *C1 = nullptr;
11723
11724 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11725 N0 = SDValue();
11726 else if (DAG.isBaseWithConstantOffset(N0)) {
11727 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11728 // being added, so we can only safely match a 32-bit addition with no
11729 // unsigned overflow.
11730 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11731 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11732 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11733 N0 = N0.getOperand(0);
11734 }
11735 }
11736
11737 if (C1) {
11738 unsigned ImmOffset = C1->getZExtValue();
11739 // If the immediate value is too big for the immoffset field, put only bits
11740 // that would normally fit in the immoffset field. The remaining value that
11741 // is copied/added for the voffset field is a large power of 2, and it
11742 // stands more chance of being CSEd with the copy/add for another similar
11743 // load/store.
11744 // However, do not do that rounding down if the remaining value would be
11745 // negative, as it appears to be illegal to have a negative offset in the
11746 // vgpr, even if adding the immediate offset makes it positive.
11747 unsigned Overflow = ImmOffset & ~MaxImm;
11748 ImmOffset -= Overflow;
11749 if ((int32_t)Overflow < 0) {
11750 Overflow += ImmOffset;
11751 ImmOffset = 0;
11752 }
11753 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11754 if (Overflow) {
11755 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11756 if (!N0)
11757 N0 = OverflowVal;
11758 else {
11759 SDValue Ops[] = {N0, OverflowVal};
11760 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11761 }
11762 }
11763 }
11764 if (!N0)
11765 N0 = DAG.getConstant(0, DL, MVT::i32);
11766 if (!C1)
11767 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11768 return {N0, SDValue(C1, 0)};
11769}
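// [Editor's note] The split above keeps only the low bits that fit the MUBUF
// immediate field and moves the power-of-2 remainder into the 32-bit voffset
// add; if that remainder would read as negative, the whole offset goes to the
// voffset instead. A standalone sketch of the same arithmetic, assuming a
// 12-bit immediate field (the real limit comes from
// SIInstrInfo::getMaxMUBUFImmOffset); illustrative only, not part of this file:
static void splitImmOffsetSketch(unsigned Combined, unsigned &VOff,
                                 unsigned &Imm) {
  const unsigned MaxImm = 4095; // assumed immoffset capacity
  Imm = Combined & MaxImm;      // bits that fit the immediate field
  VOff = Combined & ~MaxImm;    // large power-of-2 remainder, CSE-friendly
  if ((int)VOff < 0) {          // never leave a negative value for the VGPR
    VOff += Imm;
    Imm = 0;
  }
}
// For example, Combined = 4200 gives VOff = 4096 and Imm = 104.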
11770
11771// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11772// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11773// pointed to by Offsets.
11774void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11775 SelectionDAG &DAG, SDValue *Offsets,
11776 Align Alignment) const {
11777 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11778 SDLoc DL(CombinedOffset);
11779 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11780 uint32_t Imm = C->getZExtValue();
11781 uint32_t SOffset, ImmOffset;
11782 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11783 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11784 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11785 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11786 return;
11787 }
11788 }
11789 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11790 SDValue N0 = CombinedOffset.getOperand(0);
11791 SDValue N1 = CombinedOffset.getOperand(1);
11792 uint32_t SOffset, ImmOffset;
11793 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11794 if (Offset >= 0 &&
11795 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11796 Offsets[0] = N0;
11797 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11798 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11799 return;
11800 }
11801 }
11802
11803 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11804 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11805 : DAG.getConstant(0, DL, MVT::i32);
11806
11807 Offsets[0] = CombinedOffset;
11808 Offsets[1] = SOffsetZero;
11809 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11810}
11811
11812SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11813 SelectionDAG &DAG) const {
11814 if (!MaybePointer.getValueType().isScalarInteger())
11815 return MaybePointer;
11816
11817 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11818 return Rsrc;
11819}
11820
11821// Wrap a global or flat pointer into a buffer intrinsic using the flags
11822// specified in the intrinsic.
11823SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11824 SelectionDAG &DAG) const {
11825 SDLoc Loc(Op);
11826
11827 SDValue Pointer = Op->getOperand(1);
11828 SDValue Stride = Op->getOperand(2);
11829 SDValue NumRecords = Op->getOperand(3);
11830 SDValue Flags = Op->getOperand(4);
11831
11832 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11833 SDValue Rsrc;
11834
11835 if (Subtarget->has45BitNumRecordsBufferResource()) {
11836 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11837 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
11838 // num_records.
11839 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11840 SDValue NumRecordsLHS =
11841 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11842 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11843 SDValue LowHalf =
11844 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11845
11846 // Build the higher 64-bit value, which has the higher 38-bit num_records,
11847 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
11848 SDValue NumRecordsRHS =
11849 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11850 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11851 SDValue ShiftedStride =
11852 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11853 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11854 SDValue ExtShiftedStrideVec =
11855 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11856 SDValue ExtShiftedStride =
11857 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11858 SDValue ShiftedFlags =
11859 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11860 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11861 SDValue ExtShiftedFlagsVec =
11862 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11863 SDValue ExtShiftedFlags =
11864 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11865 SDValue CombinedFields =
11866 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11867 SDValue HighHalf =
11868 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11869
11870 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11871 } else {
11872 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11873 auto [LowHalf, HighHalf] =
11874 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11875 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11876 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11877 SDValue ShiftedStride =
11878 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11879 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11880 SDValue NewHighHalf =
11881 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11882
11883 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11884 NumRecords, Flags);
11885 }
11886
11887 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11888 return RsrcPtr;
11889}
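// [Editor's note] An illustrative sketch of the dword layout assembled by the
// legacy (non-45-bit num_records) branch above, written with plain integer
// arithmetic. Field positions are taken from the code; the function name and
// everything else here is assumed and not part of the lowering:
static void packV4RsrcSketch(unsigned long long Base, unsigned Stride,
                             unsigned NumRecords, unsigned Flags,
                             unsigned Dword[4]) {
  Dword[0] = (unsigned)Base;                   // base address, low 32 bits
  Dword[1] = (unsigned)(Base >> 32) & 0xffffu; // base address, high 16 bits
  Dword[1] |= (Stride & 0xffffu) << 16;        // stride in the upper half
  Dword[2] = NumRecords;                       // num_records
  Dword[3] = Flags;                            // descriptor flags
}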
11890
11891 // Handle 8-bit and 16-bit buffer loads
11892SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11893 EVT LoadVT, SDLoc DL,
11894 ArrayRef<SDValue> Ops,
11895 MachineMemOperand *MMO,
11896 bool IsTFE) const {
11897 EVT IntVT = LoadVT.changeTypeToInteger();
11898
11899 if (IsTFE) {
11900 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11901 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11902 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11903 MachineFunction &MF = DAG.getMachineFunction();
11904 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11905 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11906 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11907 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11908 DAG.getConstant(1, DL, MVT::i32));
11909 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11910 DAG.getConstant(0, DL, MVT::i32));
11911 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11912 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11913 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11914 }
11915
11916 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11917 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11918 : AMDGPUISD::BUFFER_LOAD_USHORT;
11919
11920 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11921 SDValue BufferLoad =
11922 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11923 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11924 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11925
11926 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11927}
11928
11929 // Handle 8-bit and 16-bit buffer stores
11930SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11931 EVT VDataType, SDLoc DL,
11932 SDValue Ops[],
11933 MemSDNode *M) const {
11934 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11935 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11936
11937 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11938 Ops[1] = BufferStoreExt;
11939 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11940 : AMDGPUISD::BUFFER_STORE_SHORT;
11941 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11942 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11943 M->getMemOperand());
11944}
11945
11946 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11947 SDValue Op, const SDLoc &SL, EVT VT) {
11948 if (VT.bitsLT(Op.getValueType()))
11949 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11950
11951 switch (ExtType) {
11952 case ISD::SEXTLOAD:
11953 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11954 case ISD::ZEXTLOAD:
11955 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11956 case ISD::EXTLOAD:
11957 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11958 case ISD::NON_EXTLOAD:
11959 return Op;
11960 }
11961
11962 llvm_unreachable("invalid ext type");
11963}
11964
11965 // Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
11966// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11967SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11968 DAGCombinerInfo &DCI) const {
11969 SelectionDAG &DAG = DCI.DAG;
11970 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11971 return SDValue();
11972
11973 // FIXME: Constant loads should all be marked invariant.
11974 unsigned AS = Ld->getAddressSpace();
11975 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11976 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11977 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11978 return SDValue();
11979
11980 // Don't do this early, since it may interfere with adjacent load merging for
11981 // illegal types. We can avoid losing alignment information for exotic types
11982 // pre-legalize.
11983 EVT MemVT = Ld->getMemoryVT();
11984 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11985 MemVT.getSizeInBits() >= 32)
11986 return SDValue();
11987
11988 SDLoc SL(Ld);
11989
11990 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11991 "unexpected vector extload");
11992
11993 // TODO: Drop only high part of range.
11994 SDValue Ptr = Ld->getBasePtr();
11995 SDValue NewLoad = DAG.getLoad(
11996 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11997 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11998 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11999 nullptr); // Drop ranges
12000
12001 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12002 if (MemVT.isFloatingPoint()) {
12004 "unexpected fp extload");
12005 TruncVT = MemVT.changeTypeToInteger();
12006 }
12007
12008 SDValue Cvt = NewLoad;
12009 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12010 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12011 DAG.getValueType(TruncVT));
12012 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12013 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
12014 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12015 } else {
12016 assert(Ld->getExtensionType() == ISD::EXTLOAD);
12017 }
12018
12019 EVT VT = Ld->getValueType(0);
12020 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12021
12022 DCI.AddToWorklist(Cvt.getNode());
12023
12024 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12025 // the appropriate extension from the 32-bit load.
12026 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12027 DCI.AddToWorklist(Cvt.getNode());
12028
12029 // Handle conversion back to floating point if necessary.
12030 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12031
12032 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12033}
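// [Editor's note] A scalar sketch of the in-register extension re-applied
// after the widened 32-bit load above. It assumes two's-complement wrapping
// and arithmetic right shifts for the signed case; illustrative only:
static unsigned widenedExtractSketch(unsigned Word, unsigned Bits,
                                     bool Signed) {
  unsigned Shift = 32 - Bits;                         // e.g. 24 for an i8 load
  if (Signed)
    return (unsigned)((int)(Word << Shift) >> Shift); // sign_extend_inreg
  return (Word << Shift) >> Shift;                    // zero-extend (mask)
}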
12034
12035 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12036 const SIMachineFunctionInfo &Info) {
12037 // TODO: Should check if the address can definitely not access stack.
12038 if (Info.isEntryFunction())
12039 return Info.getUserSGPRInfo().hasFlatScratchInit();
12040 return true;
12041}
12042
12043SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12044 SDLoc DL(Op);
12045 LoadSDNode *Load = cast<LoadSDNode>(Op);
12046 ISD::LoadExtType ExtType = Load->getExtensionType();
12047 EVT MemVT = Load->getMemoryVT();
12048 MachineMemOperand *MMO = Load->getMemOperand();
12049
12050 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12051 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12052 return SDValue();
12053
12054 // FIXME: Copied from PPC
12055 // First, load into 32 bits, then truncate to 1 bit.
12056
12057 SDValue Chain = Load->getChain();
12058 SDValue BasePtr = Load->getBasePtr();
12059
12060 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12061
12062 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12063 RealMemVT, MMO);
12064
12065 if (!MemVT.isVector()) {
12066 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12067 NewLD.getValue(1)};
12068
12069 return DAG.getMergeValues(Ops, DL);
12070 }
12071
12072 SmallVector<SDValue, 3> Elts;
12073 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12074 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12075 DAG.getConstant(I, DL, MVT::i32));
12076
12077 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12078 }
12079
12080 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12081
12082 return DAG.getMergeValues(Ops, DL);
12083 }
12084
12085 if (!MemVT.isVector())
12086 return SDValue();
12087
12088 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12089 "Custom lowering for non-i32 vectors hasn't been implemented.");
12090
12091 Align Alignment = Load->getAlign();
12092 unsigned AS = Load->getAddressSpace();
12093 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12094 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12095 return SplitVectorLoad(Op, DAG);
12096 }
12097
12098 MachineFunction &MF = DAG.getMachineFunction();
12099 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12100 // If there is a possibility that flat instruction access scratch memory
12101 // then we need to use the same legalization rules we use for private.
12102 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12103 !Subtarget->hasMultiDwordFlatScratchAddressing())
12104 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12105 ? AMDGPUAS::PRIVATE_ADDRESS
12106 : AMDGPUAS::GLOBAL_ADDRESS;
12107
12108 unsigned NumElements = MemVT.getVectorNumElements();
12109
12110 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12111 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12112 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12113 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12114 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12115 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12116 Alignment >= Align(4) && NumElements < 32) {
12117 if (MemVT.isPow2VectorType() ||
12118 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12119 return SDValue();
12120 return WidenOrSplitVectorLoad(Op, DAG);
12121 }
12122 // Non-uniform loads will be selected to MUBUF instructions, so they
12123 // have the same legalization requirements as global and private
12124 // loads.
12125 //
12126 }
12127 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12128 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12129 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12130 if (NumElements > 4)
12131 return SplitVectorLoad(Op, DAG);
12132 // v3 loads not supported on SI.
12133 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12134 return WidenOrSplitVectorLoad(Op, DAG);
12135
12136 // v3 and v4 loads are supported for private and global memory.
12137 return SDValue();
12138 }
12139 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12140 // Depending on the setting of the private_element_size field in the
12141 // resource descriptor, we can only make private accesses up to a certain
12142 // size.
12143 switch (Subtarget->getMaxPrivateElementSize()) {
12144 case 4: {
12145 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12146 return DAG.getMergeValues({Op0, Op1}, DL);
12147 }
12148 case 8:
12149 if (NumElements > 2)
12150 return SplitVectorLoad(Op, DAG);
12151 return SDValue();
12152 case 16:
12153 // Same as global/flat
12154 if (NumElements > 4)
12155 return SplitVectorLoad(Op, DAG);
12156 // v3 loads not supported on SI.
12157 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12158 return WidenOrSplitVectorLoad(Op, DAG);
12159
12160 return SDValue();
12161 default:
12162 llvm_unreachable("unsupported private_element_size");
12163 }
12164 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12165 unsigned Fast = 0;
12166 auto Flags = Load->getMemOperand()->getFlags();
12167 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
12168 Load->getAlign(), Flags, &Fast) &&
12169 Fast > 1)
12170 return SDValue();
12171
12172 if (MemVT.isVector())
12173 return SplitVectorLoad(Op, DAG);
12174 }
12175
12176 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12177 MemVT, *Load->getMemOperand())) {
12178 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12179 return DAG.getMergeValues({Op0, Op1}, DL);
12180 }
12181
12182 return SDValue();
12183}
12184
12185SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12186 EVT VT = Op.getValueType();
12187 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12188 VT.getSizeInBits() == 512)
12189 return splitTernaryVectorOp(Op, DAG);
12190
12191 assert(VT.getSizeInBits() == 64);
12192
12193 SDLoc DL(Op);
12194 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12195
12196 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12197 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12198
12199 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12200 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12201
12202 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12203 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12204
12205 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12206
12207 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12208 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12209
12210 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12211
12212 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12213 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12214}
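// [Editor's note] A standalone sketch of the decomposition above: the frozen
// condition drives two independent 32-bit selects on the low and high halves,
// and the result is reassembled as a 64-bit value. Illustrative only:
static unsigned long long select64Sketch(bool Cond, unsigned long long TrueV,
                                         unsigned long long FalseV) {
  unsigned Lo = Cond ? (unsigned)TrueV : (unsigned)FalseV;
  unsigned Hi = Cond ? (unsigned)(TrueV >> 32) : (unsigned)(FalseV >> 32);
  return ((unsigned long long)Hi << 32) | Lo;
}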
12215
12216// Catch division cases where we can use shortcuts with rcp and rsq
12217// instructions.
12218SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12219 SelectionDAG &DAG) const {
12220 SDLoc SL(Op);
12221 SDValue LHS = Op.getOperand(0);
12222 SDValue RHS = Op.getOperand(1);
12223 EVT VT = Op.getValueType();
12224 const SDNodeFlags Flags = Op->getFlags();
12225
12226 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12227
12228 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12229 // Without !fpmath accuracy information, we can't do more because we don't
12230 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12231 // f16 is always accurate enough
12232 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12233 return SDValue();
12234
12235 if (CLHS->isExactlyValue(1.0)) {
12236 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12237 // the CI documentation has a worst case error of 1 ulp.
12238 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12239 // use it as long as we aren't trying to use denormals.
12240 //
12241 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12242
12243 // 1.0 / sqrt(x) -> rsq(x)
12244
12245 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12246 // error seems really high at 2^29 ULP.
12247 // 1.0 / x -> rcp(x)
12248 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12249 }
12250
12251 // Same as for 1.0, but expand the sign out of the constant.
12252 if (CLHS->isExactlyValue(-1.0)) {
12253 // -1.0 / x -> rcp (fneg x)
12254 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12255 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12256 }
12257 }
12258
12259 // For f16 and bf16 require afn or arcp.
12260 // For f32 require afn.
12261 if (!AllowInaccurateRcp &&
12262 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12263 return SDValue();
12264
12265 // Turn into multiply by the reciprocal.
12266 // x / y -> x * (1.0 / y)
12267 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12268 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12269}
12270
12271SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12272 SelectionDAG &DAG) const {
12273 SDLoc SL(Op);
12274 SDValue X = Op.getOperand(0);
12275 SDValue Y = Op.getOperand(1);
12276 EVT VT = Op.getValueType();
12277 const SDNodeFlags Flags = Op->getFlags();
12278
12279 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12280 if (!AllowInaccurateDiv)
12281 return SDValue();
12282
12283 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12284 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12285
12286 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12287 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12288
12289 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12290 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12291 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12292 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12293 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12294 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12295}
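// [Editor's note] The fast f64 path above is two Newton-Raphson refinements of
// the hardware reciprocal estimate followed by one residual correction, all
// expressed with FMAs. A plain-double sketch of the same recurrence (without
// fused rounding), with the initial estimate passed in; illustrative only:
static double fastDiv64Sketch(double X, double Y, double RcpEstimate) {
  double R = RcpEstimate;
  R = R + R * (1.0 - Y * R);  // first refinement: r += r * (1 - y*r)
  R = R + R * (1.0 - Y * R);  // second refinement
  double Q = X * R;           // tentative quotient
  return Q + R * (X - Y * Q); // correct with the remaining residual
}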
12296
12297static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12298 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12299 SDNodeFlags Flags) {
12300 if (GlueChain->getNumValues() <= 1) {
12301 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12302 }
12303
12304 assert(GlueChain->getNumValues() == 3);
12305
12306 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12307 switch (Opcode) {
12308 default:
12309 llvm_unreachable("no chain equivalent for opcode");
12310 case ISD::FMUL:
12311 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12312 break;
12313 }
12314
12315 return DAG.getNode(Opcode, SL, VTList,
12316 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12317 Flags);
12318}
12319
12320static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12321 EVT VT, SDValue A, SDValue B, SDValue C,
12322 SDValue GlueChain, SDNodeFlags Flags) {
12323 if (GlueChain->getNumValues() <= 1) {
12324 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12325 }
12326
12327 assert(GlueChain->getNumValues() == 3);
12328
12329 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12330 switch (Opcode) {
12331 default:
12332 llvm_unreachable("no chain equivalent for opcode");
12333 case ISD::FMA:
12334 Opcode = AMDGPUISD::FMA_W_CHAIN;
12335 break;
12336 }
12337
12338 return DAG.getNode(Opcode, SL, VTList,
12339 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12340 Flags);
12341}
12342
12343SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12344 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12345 return FastLowered;
12346
12347 SDLoc SL(Op);
12348 EVT VT = Op.getValueType();
12349 SDValue LHS = Op.getOperand(0);
12350 SDValue RHS = Op.getOperand(1);
12351
12352 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12353 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12354
12355 if (VT == MVT::bf16) {
12356 SDValue ExtDiv =
12357 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12358 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12359 DAG.getTargetConstant(0, SL, MVT::i32));
12360 }
12361
12362 assert(VT == MVT::f16);
12363
12364 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12365 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12366 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12367 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12368 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12369 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12370 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12371 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12372 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12373 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12374 // q16.u = opx(V_CVT_F16_F32, q32.u);
12375 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12376
12377 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12378 unsigned FMADOpCode =
12380 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12381 SDValue Rcp =
12382 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12383 SDValue Quot =
12384 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12385 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12386 Op->getFlags());
12387 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12388 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12389 Op->getFlags());
12390 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12391 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12392 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12393 DAG.getConstant(0xff800000, SL, MVT::i32));
12394 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12395 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12396 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12397 DAG.getTargetConstant(0, SL, MVT::i32));
12398 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12399 Op->getFlags());
12400}
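// [Editor's note] A scalar transcription of the f32 refinement spelled out in
// the pseudo-code comment above: refine q = a * rcp(b) twice, then add only
// the sign and exponent bits of the final error term before rounding to f16.
// Uses llvm::bit_cast (llvm/ADT/bit.h) and unfused arithmetic; illustrative
// only, the real lowering uses FMA/FMAD and V_DIV_FIXUP_F16:
static float fdiv16CoreSketch(float A32, float B32, float Rcp) {
  float Q = A32 * Rcp;     // q = n * rcp
  float E = A32 - B32 * Q; // err = n - d*q
  Q = E * Rcp + Q;         // refine q
  E = A32 - B32 * Q;       // recompute err
  unsigned Bits = llvm::bit_cast<unsigned>(E * Rcp) & 0xff800000u;
  return Q + llvm::bit_cast<float>(Bits); // keep only sign + exponent of tmp
}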
12401
12402// Faster 2.5 ULP division that does not support denormals.
12403SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12404 SDNodeFlags Flags = Op->getFlags();
12405 SDLoc SL(Op);
12406 SDValue LHS = Op.getOperand(1);
12407 SDValue RHS = Op.getOperand(2);
12408
12409 // TODO: The combiner should probably handle elimination of redundant fabs.
12411 ? RHS
12412 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12413
12414 const APFloat K0Val(0x1p+96f);
12415 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12416
12417 const APFloat K1Val(0x1p-32f);
12418 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12419
12420 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12421
12422 EVT SetCCVT =
12423 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12424
12425 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12426
12427 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12428
12429 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12430
12431 // rcp does not support denormals.
12432 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12433
12434 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12435
12436 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12437}
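// [Editor's note] lowerFDIV_FAST pre-scales a huge denominator (|rhs| > 2^96)
// by 2^-32 so the reciprocal approximation stays in range, then folds the
// same factor back into the product. A scalar sketch with 1.0f/x standing in
// for the hardware rcp; illustrative only:
static float fastDivScaledSketch(float LHS, float RHS) {
  float AbsRHS = RHS < 0.0f ? -RHS : RHS;
  float Scale = AbsRHS > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
  float Rcp = 1.0f / (RHS * Scale); // stand-in for AMDGPUISD::RCP
  return Scale * (LHS * Rcp);       // rescale the quotient
}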
12438
12439// Returns immediate value for setting the F32 denorm mode when using the
12440// S_DENORM_MODE instruction.
12441 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12442 const SIMachineFunctionInfo *Info,
12443 const GCNSubtarget *ST) {
12444 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12445 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12446 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12447 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12448}
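// [Editor's note] The immediate built above packs the f32 (SP) mode into bits
// [1:0] and the f64/f16 (DP) default into bits [3:2] of the S_DENORM_MODE
// operand. A minimal sketch of the packing, with example values assumed:
static unsigned packDenormModeSketch(unsigned SPMode, unsigned DPMode) {
  return SPMode | (DPMode << 2); // e.g. SP = 0 (flush), DP = 3 (IEEE) -> 0xC
}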
12449
12450SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12451 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12452 return FastLowered;
12453
12454 // The selection matcher assumes that anything with a chain selects to a
12455 // mayRaiseFPException machine instruction. Since we're introducing a chain
12456 // here, we need to explicitly report nofpexcept for the regular fdiv
12457 // lowering.
12458 SDNodeFlags Flags = Op->getFlags();
12459 Flags.setNoFPExcept(true);
12460
12461 SDLoc SL(Op);
12462 SDValue LHS = Op.getOperand(0);
12463 SDValue RHS = Op.getOperand(1);
12464
12465 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12466
12467 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12468
12469 SDValue DenominatorScaled =
12470 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12471 SDValue NumeratorScaled =
12472 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12473
12474 // Denominator is scaled to not be denormal, so using rcp is ok.
12475 SDValue ApproxRcp =
12476 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12477 SDValue NegDivScale0 =
12478 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12479
12480 using namespace AMDGPU::Hwreg;
12481 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12482 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12483
12484 const MachineFunction &MF = DAG.getMachineFunction();
12485 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12486 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12487
12488 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12489 const bool HasDynamicDenormals =
12490 (DenormMode.Input == DenormalMode::Dynamic) ||
12491 (DenormMode.Output == DenormalMode::Dynamic);
12492
12493 SDValue SavedDenormMode;
12494
12495 if (!PreservesDenormals) {
12496 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12497 // lowering. The chain dependence is insufficient, and we need glue. We do
12498 // not need the glue variants in a strictfp function.
12499
12500 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12501
12502 SDValue Glue = DAG.getEntryNode();
12503 if (HasDynamicDenormals) {
12504 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12505 DAG.getVTList(MVT::i32, MVT::Glue),
12506 {BitField, Glue});
12507 SavedDenormMode = SDValue(GetReg, 0);
12508
12509 Glue = DAG.getMergeValues(
12510 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12511 }
12512
12513 SDNode *EnableDenorm;
12514 if (Subtarget->hasDenormModeInst()) {
12515 const SDValue EnableDenormValue =
12516 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12517
12518 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12519 EnableDenormValue)
12520 .getNode();
12521 } else {
12522 const SDValue EnableDenormValue =
12523 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12524 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12525 {EnableDenormValue, BitField, Glue});
12526 }
12527
12528 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12529 SDValue(EnableDenorm, 1)};
12530
12531 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12532 }
12533
12534 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12535 ApproxRcp, One, NegDivScale0, Flags);
12536
12537 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12538 ApproxRcp, Fma0, Flags);
12539
12540 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12541 Fma1, Flags);
12542
12543 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12544 NumeratorScaled, Mul, Flags);
12545
12546 SDValue Fma3 =
12547 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12548
12549 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12550 NumeratorScaled, Fma3, Flags);
12551
12552 if (!PreservesDenormals) {
12553 SDNode *DisableDenorm;
12554 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12555 const SDValue DisableDenormValue = getSPDenormModeValue(
12556 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12557
12558 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12559 DisableDenorm =
12560 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12561 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12562 .getNode();
12563 } else {
12564 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12565 const SDValue DisableDenormValue =
12566 HasDynamicDenormals
12567 ? SavedDenormMode
12568 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12569
12570 DisableDenorm = DAG.getMachineNode(
12571 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12572 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12573 }
12574
12575 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12576 SDValue(DisableDenorm, 0), DAG.getRoot());
12577 DAG.setRoot(OutputChain);
12578 }
12579
12580 SDValue Scale = NumeratorScaled.getValue(1);
12581 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12582 {Fma4, Fma1, Fma3, Scale}, Flags);
12583
12584 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12585}
12586
12587SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12588 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12589 return FastLowered;
12590
12591 SDLoc SL(Op);
12592 SDValue X = Op.getOperand(0);
12593 SDValue Y = Op.getOperand(1);
12594
12595 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12596
12597 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12598
12599 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12600
12601 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12602
12603 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12604
12605 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12606
12607 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12608
12609 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12610
12611 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12612
12613 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12614 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12615
12616 SDValue Fma4 =
12617 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12618
12619 SDValue Scale;
12620
12621 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12622 // Work around a hardware bug on SI where the condition output from div_scale
12623 // is not usable.
12624
12625 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12626
12627 // Figure out which scale to use for div_fmas.
12628 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12629 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12630 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12631 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12632
12633 SDValue NumHi =
12634 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12635 SDValue DenHi =
12636 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12637
12638 SDValue Scale0Hi =
12639 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12640 SDValue Scale1Hi =
12641 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12642
12643 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12644 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12645 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12646 } else {
12647 Scale = DivScale1.getValue(1);
12648 }
12649
12650 SDValue Fmas =
12651 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12652
12653 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12654}
12655
12656SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12657 EVT VT = Op.getValueType();
12658
12659 if (VT == MVT::f32)
12660 return LowerFDIV32(Op, DAG);
12661
12662 if (VT == MVT::f64)
12663 return LowerFDIV64(Op, DAG);
12664
12665 if (VT == MVT::f16 || VT == MVT::bf16)
12666 return LowerFDIV16(Op, DAG);
12667
12668 llvm_unreachable("Unexpected type for fdiv");
12669}
12670
12671SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12672 SDLoc dl(Op);
12673 SDValue Val = Op.getOperand(0);
12674 EVT VT = Val.getValueType();
12675 EVT ResultExpVT = Op->getValueType(1);
12676 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12677
12678 SDValue Mant = DAG.getNode(
12679 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12680 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12681
12682 SDValue Exp = DAG.getNode(
12683 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12684 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12685
12686 if (Subtarget->hasFractBug()) {
12687 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12688 SDValue Inf =
12689 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12690
12691 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12692 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12693 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12694 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12695 }
12696
12697 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12698 return DAG.getMergeValues({Mant, CastExp}, dl);
12699}
12700
12701SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12702 SDLoc DL(Op);
12703 StoreSDNode *Store = cast<StoreSDNode>(Op);
12704 EVT VT = Store->getMemoryVT();
12705
12706 if (VT == MVT::i1) {
12707 return DAG.getTruncStore(
12708 Store->getChain(), DL,
12709 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12710 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12711 }
12712
12713 assert(VT.isVector() &&
12714 Store->getValue().getValueType().getScalarType() == MVT::i32);
12715
12716 unsigned AS = Store->getAddressSpace();
12717 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12718 Store->getAlign().value() < VT.getStoreSize() &&
12719 VT.getSizeInBits() > 32) {
12720 return SplitVectorStore(Op, DAG);
12721 }
12722
12723 MachineFunction &MF = DAG.getMachineFunction();
12724 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12725 // If there is a possibility that flat instruction access scratch memory
12726 // then we need to use the same legalization rules we use for private.
12727 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12728 !Subtarget->hasMultiDwordFlatScratchAddressing())
12729 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12730 ? AMDGPUAS::PRIVATE_ADDRESS
12731 : AMDGPUAS::GLOBAL_ADDRESS;
12732
12733 unsigned NumElements = VT.getVectorNumElements();
12734 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12735 if (NumElements > 4)
12736 return SplitVectorStore(Op, DAG);
12737 // v3 stores not supported on SI.
12738 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12739 return SplitVectorStore(Op, DAG);
12740
12741 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12742 VT, *Store->getMemOperand()))
12743 return expandUnalignedStore(Store, DAG);
12744
12745 return SDValue();
12746 }
12747 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12748 switch (Subtarget->getMaxPrivateElementSize()) {
12749 case 4:
12750 return scalarizeVectorStore(Store, DAG);
12751 case 8:
12752 if (NumElements > 2)
12753 return SplitVectorStore(Op, DAG);
12754 return SDValue();
12755 case 16:
12756 if (NumElements > 4 ||
12757 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12758 return SplitVectorStore(Op, DAG);
12759 return SDValue();
12760 default:
12761 llvm_unreachable("unsupported private_element_size");
12762 }
12763 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12764 unsigned Fast = 0;
12765 auto Flags = Store->getMemOperand()->getFlags();
12766 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12767 Store->getAlign(), Flags, &Fast) &&
12768 Fast > 1)
12769 return SDValue();
12770
12771 if (VT.isVector())
12772 return SplitVectorStore(Op, DAG);
12773
12774 return expandUnalignedStore(Store, DAG);
12775 }
12776
12777 // Probably an invalid store. If so we'll end up emitting a selection error.
12778 return SDValue();
12779}
12780
12781// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12782SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12783 SDLoc SL(Op);
12784 assert(!Subtarget->has16BitInsts());
12785 SDNodeFlags Flags = Op->getFlags();
12786 SDValue Ext =
12787 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12788
12789 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12790 SDValue Sqrt =
12791 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12792
12793 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12794 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12795}
12796
12797SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12798 SDLoc DL(Op);
12799 SDNodeFlags Flags = Op->getFlags();
12800 MVT VT = Op.getValueType().getSimpleVT();
12801 const SDValue X = Op.getOperand(0);
12802
12803 if (allowApproxFunc(DAG, Flags)) {
12804 // Instruction is 1ulp but ignores denormals.
12805 return DAG.getNode(
12806 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12807 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12808 }
12809
12810 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12811 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12812
12813 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12814
12815 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12816
12817 SDValue SqrtX =
12818 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12819
12820 SDValue SqrtS;
12821 if (needsDenormHandlingF32(DAG, X, Flags)) {
12822 SDValue SqrtID =
12823 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12824 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12825
12826 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12827 SDValue SqrtSNextDownInt =
12828 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12829 DAG.getAllOnesConstant(DL, MVT::i32));
12830 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12831
12832 SDValue NegSqrtSNextDown =
12833 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12834
12835 SDValue SqrtVP =
12836 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12837
12838 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12839 DAG.getConstant(1, DL, MVT::i32));
12840 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12841
12842 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12843 SDValue SqrtVS =
12844 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12845
12846 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12847 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12848
12849 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12850 Flags);
12851
12852 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12853 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12854 Flags);
12855 } else {
12856 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12857
12858 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12859
12860 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12861 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12862 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12863
12864 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12865 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12866 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12867
12868 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12869 SDValue SqrtD =
12870 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12871 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12872 }
12873
12874 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12875
12876 SDValue ScaledDown =
12877 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12878
12879 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12880 SDValue IsZeroOrInf =
12881 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12882 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12883
12884 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12885}
12886
12887SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12888 // For double type, the SQRT and RSQ instructions don't have required
12889 // precision, we apply Goldschmidt's algorithm to improve the result:
12890 //
12891 // y0 = rsq(x)
12892 // g0 = x * y0
12893 // h0 = 0.5 * y0
12894 //
12895 // r0 = 0.5 - h0 * g0
12896 // g1 = g0 * r0 + g0
12897 // h1 = h0 * r0 + h0
12898 //
12899 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12900 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12901 // h2 = h1 * r1 + h1
12902 //
12903 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12904 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12905 //
12906 // sqrt(x) = g3
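//
// As a plain scalar sketch of the same refinement (ignoring the scaling and
// the zero/infinity handling applied below):
//
//   double y = 1.0 / sqrt(x);        // y0 = rsq(x)
//   double g = x * y, h = 0.5 * y;   // g0, h0
//   double r = fma(-h, g, 0.5);      // r0
//   g = fma(g, r, g);                // g1
//   h = fma(h, r, h);                // h1
//   g = fma(x - g * g, h, g);        // g2 = d0 * h1 + g1
//   g = fma(x - g * g, h, g);        // g3 = d1 * h1 + g2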
12907
12908 SDNodeFlags Flags = Op->getFlags();
12909
12910 SDLoc DL(Op);
12911
12912 SDValue X = Op.getOperand(0);
12913 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12914
12915 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12916
12917 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12918
12919 // Scale up input if it is too small.
12920 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12921 SDValue ScaleUp =
12922 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12923 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
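  // Since sqrt halves the exponent, scaling the input up by 2^+256 biases the
  // result by 2^+128; the ldexp by -128 below removes that bias. For example,
  // x = 0x1.0p-900 is scaled to 0x1.0p-644, its sqrt is 0x1.0p-322, and the
  // final ldexp restores the exact result 0x1.0p-450.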
12924
12925 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12926
12927 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12928
12929 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12930 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12931
12932 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12933 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12934
12935 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12936
12937 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12938
12939 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12940 SDValue SqrtD0 =
12941 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12942
12943 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12944
12945 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12946 SDValue SqrtD1 =
12947 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12948
12949 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12950
12951 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12952 SDValue ScaleDown =
12953 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12954 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12955
12956 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12957 // with finite only or nsz because rsq(+/-0) = +/-inf
12958
12959 // TODO: Check for DAZ and expand to subnormals
12960 SDValue IsZeroOrInf =
12961 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12962 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12963
12964 // If x is +INF, +0, or -0, use its original value
12965 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12966 Flags);
12967}
12968
12969SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12970 SDLoc DL(Op);
12971 EVT VT = Op.getValueType();
12972 SDValue Arg = Op.getOperand(0);
12973 SDValue TrigVal;
12974
12975 // Propagate fast-math flags so that the multiply we introduce can be folded
12976 // if Arg is already the result of a multiply by constant.
12977 auto Flags = Op->getFlags();
12978
12979 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
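  // The hardware SIN/COS operations take their argument in turns (one period
  // is 1.0) rather than radians, hence the multiply by 1/(2*pi) here.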
12980
12981 if (Subtarget->hasTrigReducedRange()) {
12982 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12983 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12984 } else {
12985 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12986 }
12987
12988 switch (Op.getOpcode()) {
12989 case ISD::FCOS:
12990 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12991 case ISD::FSIN:
12992 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12993 default:
12994 llvm_unreachable("Wrong trig opcode");
12995 }
12996}
12997
12998SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12999 SelectionDAG &DAG) const {
13000 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
13001 assert(AtomicNode->isCompareAndSwap());
13002 unsigned AS = AtomicNode->getAddressSpace();
13003
13004 // No custom lowering required for local address space
13005   if (AS == AMDGPUAS::LOCAL_ADDRESS)
13006     return Op;
13007
13008 // Non-local address space requires custom lowering for atomic compare
13009 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
13010 SDLoc DL(Op);
13011 SDValue ChainIn = Op.getOperand(0);
13012 SDValue Addr = Op.getOperand(1);
13013 SDValue Old = Op.getOperand(2);
13014 SDValue New = Op.getOperand(3);
13015 EVT VT = Op.getValueType();
13016 MVT SimpleVT = VT.getSimpleVT();
13017 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
13018
13019 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
13020 SDValue Ops[] = {ChainIn, Addr, NewOld};
13021
13022 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
13023 Op->getVTList(), Ops, VT,
13024 AtomicNode->getMemOperand());
13025}
13026
13027//===----------------------------------------------------------------------===//
13028// Custom DAG optimizations
13029//===----------------------------------------------------------------------===//
13030
13031SDValue
13032SITargetLowering::performUCharToFloatCombine(SDNode *N,
13033 DAGCombinerInfo &DCI) const {
13034 EVT VT = N->getValueType(0);
13035 EVT ScalarVT = VT.getScalarType();
13036 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13037 return SDValue();
13038
13039 SelectionDAG &DAG = DCI.DAG;
13040 SDLoc DL(N);
13041
13042 SDValue Src = N->getOperand(0);
13043 EVT SrcVT = Src.getValueType();
13044
13045 // TODO: We could try to match extracting the higher bytes, which would be
13046 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13047 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13048 // about in practice.
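  // For example, (uint_to_fp (and x, 0xff)) has its high 24 bits known zero,
  // so it can be lowered to CVT_F32_UBYTE0, which converts the low byte
  // directly without a separate mask.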
13049 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13050 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13051 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13052 DCI.AddToWorklist(Cvt.getNode());
13053
13054 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13055 if (ScalarVT != MVT::f32) {
13056 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13057 DAG.getTargetConstant(0, DL, MVT::i32));
13058 }
13059 return Cvt;
13060 }
13061 }
13062
13063 return SDValue();
13064}
13065
13066SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13067 DAGCombinerInfo &DCI) const {
13068 SDValue MagnitudeOp = N->getOperand(0);
13069 SDValue SignOp = N->getOperand(1);
13070
13071 // The generic combine for fcopysign + fp cast is too conservative with
13072 // vectors, and also gets confused by the splitting we will perform here, so
13073 // peek through FP casts.
13074 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13075 SignOp.getOpcode() == ISD::FP_ROUND)
13076 SignOp = SignOp.getOperand(0);
13077
13078 SelectionDAG &DAG = DCI.DAG;
13079 SDLoc DL(N);
13080 EVT SignVT = SignOp.getValueType();
13081
13082 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13083 // lower half with a copy.
13084 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13085 EVT MagVT = MagnitudeOp.getValueType();
13086
13087 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13088
13089 if (MagVT.getScalarType() == MVT::f64) {
13090 EVT F32VT = MagVT.isVector()
13091 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13092 : MVT::v2f32;
13093
13094 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13095
13096     SmallVector<SDValue, 8> NewElts;
13097     for (unsigned I = 0; I != NumElts; ++I) {
13098 SDValue MagLo =
13099 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13100 DAG.getConstant(2 * I, DL, MVT::i32));
13101 SDValue MagHi =
13102 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13103 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13104
13105 SDValue SignOpElt =
13106 MagVT.isVector()
13107               ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
13108                             SignOp, DAG.getConstant(I, DL, MVT::i32))
13109 : SignOp;
13110
13111 SDValue HiOp =
13112 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13113
13114 SDValue Vector =
13115 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13116
13117 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13118 NewElts.push_back(NewElt);
13119 }
13120
13121 if (NewElts.size() == 1)
13122 return NewElts[0];
13123
13124 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13125 }
13126
13127 if (SignVT.getScalarType() != MVT::f64)
13128 return SDValue();
13129
13130 // Reduce width of sign operand, we only need the highest bit.
13131 //
13132 // fcopysign f64:x, f64:y ->
13133 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13134 // TODO: In some cases it might make sense to go all the way to f16.
13135
13136 EVT F32VT = MagVT.isVector()
13137 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13138 : MVT::v2f32;
13139
13140 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13141
13142 SmallVector<SDValue, 8> F32Signs;
13143 for (unsigned I = 0; I != NumElts; ++I) {
13144 // Take sign from odd elements of cast vector
13145 SDValue SignAsF32 =
13146 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13147 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13148 F32Signs.push_back(SignAsF32);
13149 }
13150
13151 SDValue NewSign =
13152 NumElts == 1
13153 ? F32Signs.back()
13154           : DAG.getNode(ISD::BUILD_VECTOR, DL,
13155                         EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13156 F32Signs);
13157
13158 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13159 NewSign);
13160}
13161
13162// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13163// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13164// bits
13165
13166// This is a variant of
13167// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13168//
13169// The normal DAG combiner will do this, but only if the add has one use since
13170// that would increase the number of instructions.
13171//
13172// This prevents us from seeing a constant offset that can be folded into a
13173// memory instruction's addressing mode. If we know the resulting add offset of
13174// a pointer can be folded into an addressing offset, we can replace the pointer
13175// operand with the add of new constant offset. This eliminates one of the uses,
13176// and may allow the remaining use to also be simplified.
13177//
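// For example, (shl (add ptr, 16), 2) with multiple uses of the add becomes
// (add (shl ptr, 2), 64); the constant 64 can then be folded into the offset
// field of the memory instruction when that offset is legal for the address
// space.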
13178SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13179 EVT MemVT,
13180 DAGCombinerInfo &DCI) const {
13181 SDValue N0 = N->getOperand(0);
13182 SDValue N1 = N->getOperand(1);
13183
13184 // We only do this to handle cases where it's profitable when there are
13185 // multiple uses of the add, so defer to the standard combine.
13186 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13187 return SDValue();
13188
13189 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13190 if (!CN1)
13191 return SDValue();
13192
13193 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13194 if (!CAdd)
13195 return SDValue();
13196
13197 SelectionDAG &DAG = DCI.DAG;
13198
13199 if (N0->getOpcode() == ISD::OR &&
13200 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13201 return SDValue();
13202
13203 // If the resulting offset is too large, we can't fold it into the
13204 // addressing mode offset.
13205 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13206 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13207
13208 AddrMode AM;
13209 AM.HasBaseReg = true;
13210 AM.BaseOffs = Offset.getSExtValue();
13211 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13212 return SDValue();
13213
13214 SDLoc SL(N);
13215 EVT VT = N->getValueType(0);
13216
13217 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13218 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13219
13220 SDNodeFlags Flags;
13221 Flags.setNoUnsignedWrap(
13222 N->getFlags().hasNoUnsignedWrap() &&
13223 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13224
13225 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13226 // be sure that the new left operand is a proper base pointer.
13227 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13228}
13229
13230 /// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer must be
13231 /// offset by the chain and intrinsic ID. Theoretically we would also need to check the
13232/// specific intrinsic, but they all place the pointer operand first.
13233static unsigned getBasePtrIndex(const MemSDNode *N) {
13234 switch (N->getOpcode()) {
13235   case ISD::STORE:
13236   case ISD::INTRINSIC_W_CHAIN:
13237   case ISD::INTRINSIC_VOID:
13238     return 2;
13239 default:
13240 return 1;
13241 }
13242}
13243
13244SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13245 DAGCombinerInfo &DCI) const {
13246 SelectionDAG &DAG = DCI.DAG;
13247
13248 unsigned PtrIdx = getBasePtrIndex(N);
13249 SDValue Ptr = N->getOperand(PtrIdx);
13250
13251 // TODO: We could also do this for multiplies.
13252 if (Ptr.getOpcode() == ISD::SHL) {
13253 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13254 N->getMemoryVT(), DCI);
13255 if (NewPtr) {
13256 SmallVector<SDValue, 8> NewOps(N->ops());
13257
13258 NewOps[PtrIdx] = NewPtr;
13259 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13260 }
13261 }
13262
13263 return SDValue();
13264}
13265
13266static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13267 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13268 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13269 (Opc == ISD::XOR && Val == 0);
13270}
13271
13272// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13273// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13274// integer combine opportunities since most 64-bit operations are decomposed
13275// this way. TODO: We won't want this for SALU especially if it is an inline
13276// immediate.
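// For example, a 64-bit (and x, 0xffffffff) becomes an AND of the low half
// with -1 and of the high half with 0, both of which then fold away.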
13277SDValue SITargetLowering::splitBinaryBitConstantOp(
13278 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13279 const ConstantSDNode *CRHS) const {
13280 uint64_t Val = CRHS->getZExtValue();
13281 uint32_t ValLo = Lo_32(Val);
13282 uint32_t ValHi = Hi_32(Val);
13283 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13284
13285   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13286        bitOpWithConstantIsReducible(Opc, ValHi)) ||
13287       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13288 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13289 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13290 !CRHS->user_begin()->isDivergent())
13291 return SDValue();
13292
13293 // If we need to materialize a 64-bit immediate, it will be split up later
13294 // anyway. Avoid creating the harder to understand 64-bit immediate
13295 // materialization.
13296 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13297 }
13298
13299 return SDValue();
13300}
13301
13302 bool llvm::isBoolSGPR(SDValue V) {
13303   if (V.getValueType() != MVT::i1)
13304 return false;
13305 switch (V.getOpcode()) {
13306 default:
13307 break;
13308 case ISD::SETCC:
13309 case ISD::IS_FPCLASS:
13310 case AMDGPUISD::FP_CLASS:
13311 return true;
13312 case ISD::AND:
13313 case ISD::OR:
13314 case ISD::XOR:
13315 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13316 case ISD::SADDO:
13317 case ISD::UADDO:
13318 case ISD::SSUBO:
13319 case ISD::USUBO:
13320 case ISD::SMULO:
13321 case ISD::UMULO:
13322 return V.getResNo() == 1;
13323   case ISD::INTRINSIC_WO_CHAIN: {
13324     unsigned IntrinsicID = V.getConstantOperandVal(0);
13325 switch (IntrinsicID) {
13326 case Intrinsic::amdgcn_is_shared:
13327 case Intrinsic::amdgcn_is_private:
13328 return true;
13329 default:
13330 return false;
13331 }
13332
13333 return false;
13334 }
13335 }
13336 return false;
13337}
13338
13339// If a constant has all zeroes or all ones within each byte return it.
13340// Otherwise return 0.
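// For example, 0xff00ff00 is returned unchanged, while 0x0fff0000 returns 0
// because its top byte is only partially selected.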
13341 static uint32_t getConstantPermuteMask(uint32_t C) {
13342   // 0xff for any zero byte in the mask
13343 uint32_t ZeroByteMask = 0;
13344 if (!(C & 0x000000ff))
13345 ZeroByteMask |= 0x000000ff;
13346 if (!(C & 0x0000ff00))
13347 ZeroByteMask |= 0x0000ff00;
13348 if (!(C & 0x00ff0000))
13349 ZeroByteMask |= 0x00ff0000;
13350 if (!(C & 0xff000000))
13351 ZeroByteMask |= 0xff000000;
13352 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13353 if ((NonZeroByteMask & C) != NonZeroByteMask)
13354 return 0; // Partial bytes selected.
13355 return C;
13356}
13357
13358// Check if a node selects whole bytes from its operand 0 starting at a byte
13359// boundary while masking the rest. Returns select mask as in the v_perm_b32
13360// or -1 if not succeeded.
13361// Note byte select encoding:
13362// value 0-3 selects corresponding source byte;
13363// value 0xc selects zero;
13364// value 0xff selects 0xff.
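// For example, a left shift by 8 yields the select mask 0x0201000c: byte 0 of
// the result is zero (0x0c) and bytes 1-3 come from source bytes 0-2.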
13365 static uint32_t getPermuteMask(SDValue V) {
13366   assert(V.getValueSizeInBits() == 32);
13367
13368 if (V.getNumOperands() != 2)
13369 return ~0;
13370
13371 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13372 if (!N1)
13373 return ~0;
13374
13375 uint32_t C = N1->getZExtValue();
13376
13377 switch (V.getOpcode()) {
13378 default:
13379 break;
13380 case ISD::AND:
13381 if (uint32_t ConstMask = getConstantPermuteMask(C))
13382 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13383 break;
13384
13385 case ISD::OR:
13386 if (uint32_t ConstMask = getConstantPermuteMask(C))
13387 return (0x03020100 & ~ConstMask) | ConstMask;
13388 break;
13389
13390 case ISD::SHL:
13391 if (C % 8)
13392 return ~0;
13393
13394 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13395
13396 case ISD::SRL:
13397 if (C % 8)
13398 return ~0;
13399
13400 return uint32_t(0x0c0c0c0c03020100ull >> C);
13401 }
13402
13403 return ~0;
13404}
13405
13406SDValue SITargetLowering::performAndCombine(SDNode *N,
13407 DAGCombinerInfo &DCI) const {
13408 if (DCI.isBeforeLegalize())
13409 return SDValue();
13410
13411 SelectionDAG &DAG = DCI.DAG;
13412 EVT VT = N->getValueType(0);
13413 SDValue LHS = N->getOperand(0);
13414 SDValue RHS = N->getOperand(1);
13415
13416 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13417 if (VT == MVT::i64 && CRHS) {
13418 if (SDValue Split =
13419 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13420 return Split;
13421 }
13422
13423 if (CRHS && VT == MVT::i32) {
13424 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13425 // nb = number of trailing zeroes in mask
13426 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13427 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
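    // For example, (and (srl x, 8), 0xff00) becomes (shl (bfe x, 16, 8), 8).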
13428 uint64_t Mask = CRHS->getZExtValue();
13429 unsigned Bits = llvm::popcount(Mask);
13430 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13431 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13432 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13433 unsigned Shift = CShift->getZExtValue();
13434 unsigned NB = CRHS->getAPIntValue().countr_zero();
13435 unsigned Offset = NB + Shift;
13436 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13437 SDLoc SL(N);
13438 SDValue BFE =
13439 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13440 DAG.getConstant(Offset, SL, MVT::i32),
13441 DAG.getConstant(Bits, SL, MVT::i32));
13442 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13443 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13444 DAG.getValueType(NarrowVT));
13445 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13446 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13447 return Shl;
13448 }
13449 }
13450 }
13451
13452 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13453 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13454 isa<ConstantSDNode>(LHS.getOperand(2))) {
13455 uint32_t Sel = getConstantPermuteMask(Mask);
13456 if (!Sel)
13457 return SDValue();
13458
13459 // Select 0xc for all zero bytes
13460 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13461 SDLoc DL(N);
13462 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13463 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13464 }
13465 }
13466
13467 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13468 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13469 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13470 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13471 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13472
13473 SDValue X = LHS.getOperand(0);
13474 SDValue Y = RHS.getOperand(0);
13475 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13476 !isTypeLegal(X.getValueType()))
13477 return SDValue();
13478
13479 if (LCC == ISD::SETO) {
13480 if (X != LHS.getOperand(1))
13481 return SDValue();
13482
13483 if (RCC == ISD::SETUNE) {
13484 const ConstantFPSDNode *C1 =
13485 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13486 if (!C1 || !C1->isInfinity() || C1->isNegative())
13487 return SDValue();
13488
13489         const uint32_t Mask = SIInstrFlags::N_NORMAL |
13490                               SIInstrFlags::S_NORMAL | SIInstrFlags::N_ZERO |
13491                               SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
13492                               SIInstrFlags::P_SUBNORMAL;
13493 
13494         static_assert(
13495             ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13496                 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13497              0x3ff) == Mask,
13498             "mask not equal");
13499
13500 SDLoc DL(N);
13501 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13502 DAG.getConstant(Mask, DL, MVT::i32));
13503 }
13504 }
13505 }
13506
13507 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13508 std::swap(LHS, RHS);
13509
13510 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13511 RHS.hasOneUse()) {
13512 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13513 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13514 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13515 // | n_nan)
13516 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13517 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13518 (RHS.getOperand(0) == LHS.getOperand(0) &&
13519 LHS.getOperand(0) == LHS.getOperand(1))) {
13520 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13521 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13522 : Mask->getZExtValue() & OrdMask;
13523
13524 SDLoc DL(N);
13525 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13526 DAG.getConstant(NewMask, DL, MVT::i32));
13527 }
13528 }
13529
13530 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13531 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13532 // and x, (sext cc from i1) => select cc, x, 0
13533 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13534 std::swap(LHS, RHS);
13535 if (isBoolSGPR(RHS.getOperand(0)))
13536 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13537 DAG.getConstant(0, SDLoc(N), MVT::i32));
13538 }
13539
13540 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13541 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13542 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13543 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13544 uint32_t LHSMask = getPermuteMask(LHS);
13545 uint32_t RHSMask = getPermuteMask(RHS);
13546 if (LHSMask != ~0u && RHSMask != ~0u) {
13547 // Canonicalize the expression in an attempt to have fewer unique masks
13548 // and therefore fewer registers used to hold the masks.
13549 if (LHSMask > RHSMask) {
13550 std::swap(LHSMask, RHSMask);
13551 std::swap(LHS, RHS);
13552 }
13553
13554 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13555 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13556 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13557 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13558
13559       // Check if we need to combine values from two sources within a byte.
13560 if (!(LHSUsedLanes & RHSUsedLanes) &&
13561 // If we select high and lower word keep it for SDWA.
13562 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13563 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13564 // Each byte in each mask is either selector mask 0-3, or has higher
13565 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13566 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13567 // mask which is not 0xff wins. By anding both masks we have a correct
13568 // result except that 0x0c shall be corrected to give 0x0c only.
13569 uint32_t Mask = LHSMask & RHSMask;
13570 for (unsigned I = 0; I < 32; I += 8) {
13571 uint32_t ByteSel = 0xff << I;
13572 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13573 Mask &= (0x0c << I) & 0xffffffff;
13574 }
13575
13576 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13577 // or 0x0c.
13578 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13579 SDLoc DL(N);
13580
13581 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13582 RHS.getOperand(0),
13583 DAG.getConstant(Sel, DL, MVT::i32));
13584 }
13585 }
13586 }
13587
13588 return SDValue();
13589}
13590
13591// A key component of v_perm is a mapping between byte position of the src
13592// operands, and the byte position of the dest. To provide such, we need: 1. the
13593// node that provides x byte of the dest of the OR, and 2. the byte of the node
13594// used to provide that x byte. calculateByteProvider finds which node provides
13595// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13596// and finds an ultimate src and byte position For example: The supported
13597// LoadCombine pattern for vector loads is as follows
13598// t1
13599// or
13600// / \
13601// t2 t3
13602// zext shl
13603// | | \
13604// t4 t5 16
13605// or anyext
13606// / \ |
13607// t6 t7 t8
13608// srl shl or
13609// / | / \ / \
13610// t9 t10 t11 t12 t13 t14
13611// trunc* 8 trunc* 8 and and
13612// | | / | | \
13613// t15 t16 t17 t18 t19 t20
13614// trunc* 255 srl -256
13615// | / \
13616// t15 t15 16
13617//
13618// *In this example, the truncs are from i32->i16
13619//
13620// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13621// respectively. calculateSrcByte would find (given node) -> ultimate src &
13622 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13623// After finding the mapping, we can combine the tree into vperm t15, t16,
13624// 0x05000407
13625
13626// Find the source and byte position from a node.
13627// \p DestByte is the byte position of the dest of the or that the src
13628// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13629// dest of the or byte. \p Depth tracks how many recursive iterations we have
13630// performed.
13631static const std::optional<ByteProvider<SDValue>>
13632calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13633 unsigned Depth = 0) {
13634 // We may need to recursively traverse a series of SRLs
13635 if (Depth >= 6)
13636 return std::nullopt;
13637
13638 if (Op.getValueSizeInBits() < 8)
13639 return std::nullopt;
13640
13641 if (Op.getValueType().isVector())
13642 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13643
13644 switch (Op->getOpcode()) {
13645 case ISD::TRUNCATE: {
13646 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13647 }
13648
13649 case ISD::SIGN_EXTEND:
13650 case ISD::ZERO_EXTEND:
13651   case ISD::SIGN_EXTEND_INREG: {
13652     SDValue NarrowOp = Op->getOperand(0);
13653 auto NarrowVT = NarrowOp.getValueType();
13654 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13655 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13656 NarrowVT = VTSign->getVT();
13657 }
13658 if (!NarrowVT.isByteSized())
13659 return std::nullopt;
13660 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13661
13662 if (SrcIndex >= NarrowByteWidth)
13663 return std::nullopt;
13664 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13665 }
13666
13667 case ISD::SRA:
13668 case ISD::SRL: {
13669 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13670 if (!ShiftOp)
13671 return std::nullopt;
13672
13673 uint64_t BitShift = ShiftOp->getZExtValue();
13674
13675 if (BitShift % 8 != 0)
13676 return std::nullopt;
13677
13678 SrcIndex += BitShift / 8;
13679
13680 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13681 }
13682
13683 default: {
13684 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13685 }
13686 }
13687 llvm_unreachable("fully handled switch");
13688}
13689
13690// For a byte position in the result of an Or, traverse the tree and find the
13691// node (and the byte of the node) which ultimately provides this {Or,
13692// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13693// the byte position of the Op that corresponds with the originally requested
13694// byte of the Or \p Depth tracks how many recursive iterations we have
13695// performed. \p StartingIndex is the originally requested byte of the Or
13696static const std::optional<ByteProvider<SDValue>>
13697calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13698 unsigned StartingIndex = 0) {
13699 // Finding Src tree of RHS of or typically requires at least 1 additional
13700 // depth
13701 if (Depth > 6)
13702 return std::nullopt;
13703
13704 unsigned BitWidth = Op.getScalarValueSizeInBits();
13705 if (BitWidth % 8 != 0)
13706 return std::nullopt;
13707 if (Index > BitWidth / 8 - 1)
13708 return std::nullopt;
13709
13710 bool IsVec = Op.getValueType().isVector();
13711 switch (Op.getOpcode()) {
13712 case ISD::OR: {
13713 if (IsVec)
13714 return std::nullopt;
13715
13716 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13717 StartingIndex);
13718 if (!RHS)
13719 return std::nullopt;
13720 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13721 StartingIndex);
13722 if (!LHS)
13723 return std::nullopt;
13724 // A well formed Or will have two ByteProviders for each byte, one of which
13725 // is constant zero
13726 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13727 return std::nullopt;
13728 if (!LHS || LHS->isConstantZero())
13729 return RHS;
13730 if (!RHS || RHS->isConstantZero())
13731 return LHS;
13732 return std::nullopt;
13733 }
13734
13735 case ISD::AND: {
13736 if (IsVec)
13737 return std::nullopt;
13738
13739 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13740 if (!BitMaskOp)
13741 return std::nullopt;
13742
13743 uint32_t BitMask = BitMaskOp->getZExtValue();
13744 // Bits we expect for our StartingIndex
13745 uint32_t IndexMask = 0xFF << (Index * 8);
13746
13747 if ((IndexMask & BitMask) != IndexMask) {
13748 // If the result of the and partially provides the byte, then it
13749 // is not well formatted
13750 if (IndexMask & BitMask)
13751 return std::nullopt;
13752       return ByteProvider<SDValue>::getConstantZero();
13753     }
13754
13755 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13756 }
13757
13758 case ISD::FSHR: {
13759 if (IsVec)
13760 return std::nullopt;
13761
13762 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
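    // For example, a 32-bit fshr by 8 returns bytes 1-4 of the 8-byte
    // concatenation X:Y, so result byte 0 comes from byte 1 of Y and result
    // byte 3 comes from byte 0 of X.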
13763 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13764 if (!ShiftOp || Op.getValueType().isVector())
13765 return std::nullopt;
13766
13767 uint64_t BitsProvided = Op.getValueSizeInBits();
13768 if (BitsProvided % 8 != 0)
13769 return std::nullopt;
13770
13771 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13772 if (BitShift % 8)
13773 return std::nullopt;
13774
13775 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13776 uint64_t ByteShift = BitShift / 8;
13777
13778 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13779 uint64_t BytesProvided = BitsProvided / 8;
13780 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13781 NewIndex %= BytesProvided;
13782 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13783 }
13784
13785 case ISD::SRA:
13786 case ISD::SRL: {
13787 if (IsVec)
13788 return std::nullopt;
13789
13790 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13791 if (!ShiftOp)
13792 return std::nullopt;
13793
13794 uint64_t BitShift = ShiftOp->getZExtValue();
13795 if (BitShift % 8)
13796 return std::nullopt;
13797
13798 auto BitsProvided = Op.getScalarValueSizeInBits();
13799 if (BitsProvided % 8 != 0)
13800 return std::nullopt;
13801
13802 uint64_t BytesProvided = BitsProvided / 8;
13803 uint64_t ByteShift = BitShift / 8;
13804 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13805 // If the byte we are trying to provide (as tracked by index) falls in this
13806 // range, then the SRL provides the byte. The byte of interest of the src of
13807 // the SRL is Index + ByteShift
13808 return BytesProvided - ByteShift > Index
13809 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13810 Index + ByteShift)
13811                : ByteProvider<SDValue>::getConstantZero();
13812   }
13813
13814 case ISD::SHL: {
13815 if (IsVec)
13816 return std::nullopt;
13817
13818 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13819 if (!ShiftOp)
13820 return std::nullopt;
13821
13822 uint64_t BitShift = ShiftOp->getZExtValue();
13823 if (BitShift % 8 != 0)
13824 return std::nullopt;
13825 uint64_t ByteShift = BitShift / 8;
13826
13827 // If we are shifting by an amount greater than (or equal to)
13828 // the index we are trying to provide, then it provides 0s. If not,
13829     // then these bytes are not definitively 0s, and the corresponding byte
13830 // of interest is Index - ByteShift of the src
13831 return Index < ByteShift
13832                ? ByteProvider<SDValue>::getConstantZero()
13833                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13834 Depth + 1, StartingIndex);
13835 }
13836 case ISD::ANY_EXTEND:
13837 case ISD::SIGN_EXTEND:
13838 case ISD::ZERO_EXTEND:
13839   case ISD::SIGN_EXTEND_INREG:
13840   case ISD::AssertZext:
13841 case ISD::AssertSext: {
13842 if (IsVec)
13843 return std::nullopt;
13844
13845 SDValue NarrowOp = Op->getOperand(0);
13846 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13847 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13848 Op->getOpcode() == ISD::AssertZext ||
13849 Op->getOpcode() == ISD::AssertSext) {
13850 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13851 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13852 }
13853 if (NarrowBitWidth % 8 != 0)
13854 return std::nullopt;
13855 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13856
13857 if (Index >= NarrowByteWidth)
13858 return Op.getOpcode() == ISD::ZERO_EXTEND
13859 ? std::optional<ByteProvider<SDValue>>(
13860                        ByteProvider<SDValue>::getConstantZero())
13861                  : std::nullopt;
13862 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13863 }
13864
13865 case ISD::TRUNCATE: {
13866 if (IsVec)
13867 return std::nullopt;
13868
13869 uint64_t NarrowByteWidth = BitWidth / 8;
13870
13871 if (NarrowByteWidth >= Index) {
13872 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13873 StartingIndex);
13874 }
13875
13876 return std::nullopt;
13877 }
13878
13879 case ISD::CopyFromReg: {
13880 if (BitWidth / 8 > Index)
13881 return calculateSrcByte(Op, StartingIndex, Index);
13882
13883 return std::nullopt;
13884 }
13885
13886 case ISD::LOAD: {
13887 auto *L = cast<LoadSDNode>(Op.getNode());
13888
13889 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13890 if (NarrowBitWidth % 8 != 0)
13891 return std::nullopt;
13892 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13893
13894     // If the width of the load does not reach the byte we are trying to provide
13895     // for and it is not a ZEXTLOAD, then the load does not provide the byte in
13896 // question
13897 if (Index >= NarrowByteWidth) {
13898 return L->getExtensionType() == ISD::ZEXTLOAD
13899 ? std::optional<ByteProvider<SDValue>>(
13900                        ByteProvider<SDValue>::getConstantZero())
13901                  : std::nullopt;
13902 }
13903
13904 if (NarrowByteWidth > Index) {
13905 return calculateSrcByte(Op, StartingIndex, Index);
13906 }
13907
13908 return std::nullopt;
13909 }
13910
13911 case ISD::BSWAP: {
13912 if (IsVec)
13913 return std::nullopt;
13914
13915 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13916 Depth + 1, StartingIndex);
13917 }
13918
13919   case ISD::EXTRACT_VECTOR_ELT: {
13920     auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13921 if (!IdxOp)
13922 return std::nullopt;
13923 auto VecIdx = IdxOp->getZExtValue();
13924 auto ScalarSize = Op.getScalarValueSizeInBits();
13925 if (ScalarSize < 32)
13926 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13927 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13928 StartingIndex, Index);
13929 }
13930
13931 case AMDGPUISD::PERM: {
13932 if (IsVec)
13933 return std::nullopt;
13934
13935 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13936 if (!PermMask)
13937 return std::nullopt;
13938
13939 auto IdxMask =
13940 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13941 if (IdxMask > 0x07 && IdxMask != 0x0c)
13942 return std::nullopt;
13943
13944 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13945 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13946
13947 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13948                            : ByteProvider<SDValue>(
13949                                  ByteProvider<SDValue>::getConstantZero());
13950   }
13951
13952 default: {
13953 return std::nullopt;
13954 }
13955 }
13956
13957 llvm_unreachable("fully handled switch");
13958}
13959
13960 // Returns true if the Operand is a scalar that was extended from a 16-bit value
13961static bool isExtendedFrom16Bits(SDValue &Operand) {
13962
13963 switch (Operand.getOpcode()) {
13964 case ISD::ANY_EXTEND:
13965 case ISD::SIGN_EXTEND:
13966 case ISD::ZERO_EXTEND: {
13967 auto OpVT = Operand.getOperand(0).getValueType();
13968 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13969 }
13970 case ISD::LOAD: {
13971 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13972 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13973 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13974 ExtType == ISD::EXTLOAD) {
13975 auto MemVT = L->getMemoryVT();
13976 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13977 }
13978 return L->getMemoryVT().getSizeInBits() == 16;
13979 }
13980 default:
13981 return false;
13982 }
13983}
13984
13985 // Returns true if the mask selects two consecutive bytes and the first byte
13986 // begins at an even (16-bit aligned) offset from byte 0.
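// For example, mask 0x0100 (bytes 0 and 1, in order) qualifies, while 0x0201
// (bytes 1 and 2) does not because it straddles a 16-bit boundary.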
13987static bool addresses16Bits(int Mask) {
13988 int Low8 = Mask & 0xff;
13989 int Hi8 = (Mask & 0xff00) >> 8;
13990
13991 assert(Low8 < 8 && Hi8 < 8);
13992 // Are the bytes contiguous in the order of increasing addresses.
13993 bool IsConsecutive = (Hi8 - Low8 == 1);
13994 // Is the first byte at location that is aligned for 16 bit instructions.
13995 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13996 // In this case, we still need code to extract the 16 bit operand, so it
13997 // is better to use i8 v_perm
13998 bool Is16Aligned = !(Low8 % 2);
13999
14000 return IsConsecutive && Is16Aligned;
14001}
14002
14003// Do not lower into v_perm if the operands are actually 16 bit
14004// and the selected bits (based on PermMask) correspond with two
14005// easily addressable 16 bit operands.
14006 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14007                                 SDValue &OtherOp) {
14008 int Low16 = PermMask & 0xffff;
14009 int Hi16 = (PermMask & 0xffff0000) >> 16;
14010
14011 auto TempOp = peekThroughBitcasts(Op);
14012 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14013
14014 auto OpIs16Bit =
14015 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14016 if (!OpIs16Bit)
14017 return true;
14018
14019 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14020 isExtendedFrom16Bits(TempOtherOp);
14021 if (!OtherOpIs16Bit)
14022 return true;
14023
14024 // Do we cleanly address both
14025 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14026}
14027
14028 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14029                                   unsigned DWordOffset) {
14030 SDValue Ret;
14031
14032 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14033 // ByteProvider must be at least 8 bits
14034 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14035
14036 if (TypeSize <= 32)
14037 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14038
14039 if (Src.getValueType().isVector()) {
14040 auto ScalarTySize = Src.getScalarValueSizeInBits();
14041 auto ScalarTy = Src.getValueType().getScalarType();
14042 if (ScalarTySize == 32) {
14043 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14044 DAG.getConstant(DWordOffset, SL, MVT::i32));
14045 }
14046 if (ScalarTySize > 32) {
14047 Ret = DAG.getNode(
14048 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14049 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14050 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14051 if (ShiftVal)
14052 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14053 DAG.getConstant(ShiftVal, SL, MVT::i32));
14054 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14055 }
14056
14057 assert(ScalarTySize < 32);
14058 auto NumElements = TypeSize / ScalarTySize;
14059 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14060 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14061 auto NumElementsIn32 = 32 / ScalarTySize;
14062 auto NumAvailElements = DWordOffset < Trunc32Elements
14063 ? NumElementsIn32
14064 : NumElements - NormalizedTrunc;
14065
14066     SmallVector<SDValue, 4> VecSrcs;
14067     DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14068 NumAvailElements);
14069
14070 Ret = DAG.getBuildVector(
14071 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14072 VecSrcs);
14073 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14074 }
14075
14076 /// Scalar Type
14077 auto ShiftVal = 32 * DWordOffset;
14078 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14079 DAG.getConstant(ShiftVal, SL, MVT::i32));
14080 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14081}
14082
14083 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14084   SelectionDAG &DAG = DCI.DAG;
14085   [[maybe_unused]] EVT VT = N->getValueType(0);
14086   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14087 
14088 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14089 assert(VT == MVT::i32);
14090 for (int i = 0; i < 4; i++) {
14091 // Find the ByteProvider that provides the ith byte of the result of OR
14092 std::optional<ByteProvider<SDValue>> P =
14093 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14094 // TODO support constantZero
14095 if (!P || P->isConstantZero())
14096 return SDValue();
14097
14098 PermNodes.push_back(*P);
14099 }
14100 if (PermNodes.size() != 4)
14101 return SDValue();
14102
14103 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14104 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14105 uint64_t PermMask = 0x00000000;
14106 for (size_t i = 0; i < PermNodes.size(); i++) {
14107 auto PermOp = PermNodes[i];
14108 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14109 // by sizeof(Src2) = 4
14110 int SrcByteAdjust = 4;
14111
14112 // If the Src uses a byte from a different DWORD, then it corresponds
14113     // with a different source
14114 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14115 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14116 if (SecondSrc)
14117 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14118 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14119 return SDValue();
14120
14121 // Set the index of the second distinct Src node
14122 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14123 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14124 SrcByteAdjust = 0;
14125 }
14126 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14127     assert(!DAG.getDataLayout().isBigEndian());
14128     PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14129 }
14130 SDLoc DL(N);
14131 SDValue Op = *PermNodes[FirstSrc.first].Src;
14132 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14133 assert(Op.getValueSizeInBits() == 32);
14134
14135 // Check that we are not just extracting the bytes in order from an op
14136 if (!SecondSrc) {
14137 int Low16 = PermMask & 0xffff;
14138 int Hi16 = (PermMask & 0xffff0000) >> 16;
14139
14140 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14141 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14142
14143 // The perm op would really just produce Op. So combine into Op
14144 if (WellFormedLow && WellFormedHi)
14145 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14146 }
14147
14148 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14149
14150 if (SecondSrc) {
14151 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14152 assert(OtherOp.getValueSizeInBits() == 32);
14153 }
14154
14155 // Check that we haven't just recreated the same FSHR node.
14156 if (N->getOpcode() == ISD::FSHR &&
14157 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14158 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14159 return SDValue();
14160
14161 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14162
14163 assert(Op.getValueType().isByteSized() &&
14164 OtherOp.getValueType().isByteSized());
14165
14166 // If the ultimate src is less than 32 bits, then we will only be
14167 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14168 // CalculateByteProvider would not have returned Op as source if we
14169 // used a byte that is outside its ValueType. Thus, we are free to
14170 // ANY_EXTEND as the extended bits are dont-cares.
14171 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14172 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14173
14174 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14175 DAG.getConstant(PermMask, DL, MVT::i32));
14176 }
14177 return SDValue();
14178}
14179
14180SDValue SITargetLowering::performOrCombine(SDNode *N,
14181 DAGCombinerInfo &DCI) const {
14182 SelectionDAG &DAG = DCI.DAG;
14183 SDValue LHS = N->getOperand(0);
14184 SDValue RHS = N->getOperand(1);
14185
14186 EVT VT = N->getValueType(0);
14187 if (VT == MVT::i1) {
14188 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14189 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14190 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14191 SDValue Src = LHS.getOperand(0);
14192 if (Src != RHS.getOperand(0))
14193 return SDValue();
14194
14195 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14196 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14197 if (!CLHS || !CRHS)
14198 return SDValue();
14199
14200 // Only 10 bits are used.
14201 static const uint32_t MaxMask = 0x3ff;
14202
14203 uint32_t NewMask =
14204 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14205 SDLoc DL(N);
14206 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14207 DAG.getConstant(NewMask, DL, MVT::i32));
14208 }
14209
14210 return SDValue();
14211 }
14212
14213 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14214   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14215       LHS.getOpcode() == AMDGPUISD::PERM &&
14216 isa<ConstantSDNode>(LHS.getOperand(2))) {
14217 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14218 if (!Sel)
14219 return SDValue();
14220
14221 Sel |= LHS.getConstantOperandVal(2);
14222 SDLoc DL(N);
14223 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14224 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14225 }
14226
14227 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14228 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14229 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14230 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14231
14232 // If all the uses of an or need to extract the individual elements, do not
14233 // attempt to lower into v_perm
14234 auto usesCombinedOperand = [](SDNode *OrUse) {
14235 // If we have any non-vectorized use, then it is a candidate for v_perm
14236 if (OrUse->getOpcode() != ISD::BITCAST ||
14237 !OrUse->getValueType(0).isVector())
14238 return true;
14239
14240 // If we have any non-vectorized use, then it is a candidate for v_perm
14241 for (auto *VUser : OrUse->users()) {
14242 if (!VUser->getValueType(0).isVector())
14243 return true;
14244
14245 // If the use of a vector is a store, then combining via a v_perm
14246 // is beneficial.
14247 // TODO -- whitelist more uses
14248 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14249 if (VUser->getOpcode() == VectorwiseOp)
14250 return true;
14251 }
14252 return false;
14253 };
14254
14255 if (!any_of(N->users(), usesCombinedOperand))
14256 return SDValue();
14257
14258 uint32_t LHSMask = getPermuteMask(LHS);
14259 uint32_t RHSMask = getPermuteMask(RHS);
14260
14261 if (LHSMask != ~0u && RHSMask != ~0u) {
14262 // Canonicalize the expression in an attempt to have fewer unique masks
14263 // and therefore fewer registers used to hold the masks.
14264 if (LHSMask > RHSMask) {
14265 std::swap(LHSMask, RHSMask);
14266 std::swap(LHS, RHS);
14267 }
14268
14269 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14270 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14271 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14272 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14273
14274       // Check if we need to combine values from two sources within a byte.
14275 if (!(LHSUsedLanes & RHSUsedLanes) &&
14276 // If we select high and lower word keep it for SDWA.
14277 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14278 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14279 // Kill zero bytes selected by other mask. Zero value is 0xc.
14280 LHSMask &= ~RHSUsedLanes;
14281 RHSMask &= ~LHSUsedLanes;
14282 // Add 4 to each active LHS lane
14283 LHSMask |= LHSUsedLanes & 0x04040404;
14284 // Combine masks
14285 uint32_t Sel = LHSMask | RHSMask;
14286 SDLoc DL(N);
14287
14288 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14289 RHS.getOperand(0),
14290 DAG.getConstant(Sel, DL, MVT::i32));
14291 }
14292 }
14293 if (LHSMask == ~0u || RHSMask == ~0u) {
14294 if (SDValue Perm = matchPERM(N, DCI))
14295 return Perm;
14296 }
14297 }
14298
14299 // Detect identity v2i32 OR and replace with identity source node.
14300 // Specifically an Or that has operands constructed from the same source node
14301 // via extract_vector_elt and build_vector. I.E.
14302 // v2i32 or(
14303 // v2i32 build_vector(
14304 // i32 extract_elt(%IdentitySrc, 0),
14305 // i32 0
14306 // ),
14307 // v2i32 build_vector(
14308 // i32 0,
14309 // i32 extract_elt(%IdentitySrc, 1)
14310 // ) )
14311 // =>
14312 // v2i32 %IdentitySrc
14313
14314 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14315 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14316
14317 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14318 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14319
14320 // Test for and normalise build vectors.
14321 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14322
14323 // Get the extract_vector_element operands.
14324 SDValue LEVE = LHS->getOperand(0);
14325 SDValue REVE = RHS->getOperand(1);
14326
14327 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14328             REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14329           // Check that different elements from the same vector are
14330 // extracted.
14331 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14332 LEVE->getOperand(1) != REVE->getOperand(1)) {
14333 SDValue IdentitySrc = LEVE.getOperand(0);
14334 return IdentitySrc;
14335 }
14336 }
14337 }
14338 }
14339
14340 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14341 return SDValue();
14342
14343 // TODO: This could be a generic combine with a predicate for extracting the
14344 // high half of an integer being free.
14345
14346 // (or i64:x, (zero_extend i32:y)) ->
14347 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14348 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14349 RHS.getOpcode() != ISD::ZERO_EXTEND)
14350 std::swap(LHS, RHS);
14351
14352 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14353 SDValue ExtSrc = RHS.getOperand(0);
14354 EVT SrcVT = ExtSrc.getValueType();
14355 if (SrcVT == MVT::i32) {
14356 SDLoc SL(N);
14357 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14358 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14359
14360 DCI.AddToWorklist(LowOr.getNode());
14361 DCI.AddToWorklist(HiBits.getNode());
14362
14363 SDValue Vec =
14364 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14365 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14366 }
14367 }
14368
14369 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14370 if (CRHS) {
14371 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14372 N->getOperand(0), CRHS))
14373 return Split;
14374 }
14375
14376 return SDValue();
14377}
14378
14379SDValue SITargetLowering::performXorCombine(SDNode *N,
14380 DAGCombinerInfo &DCI) const {
14381 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14382 return RV;
14383
14384 SDValue LHS = N->getOperand(0);
14385 SDValue RHS = N->getOperand(1);
14386
14387 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14388 SelectionDAG &DAG = DCI.DAG;
14389
14390 EVT VT = N->getValueType(0);
14391 if (CRHS && VT == MVT::i64) {
14392 if (SDValue Split =
14393 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14394 return Split;
14395 }
14396
14397 // v2i32 (xor (vselect cc, x, y), K) ->
14398   // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14399 // replaced with source modifiers when the select is lowered to CNDMASK.
14400 unsigned Opc = LHS.getOpcode();
14401 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14402 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14403 CRHS && CRHS->getAPIntValue().isSignMask()) {
14404 SDValue CC = LHS->getOperand(0);
14405 SDValue TRUE = LHS->getOperand(1);
14406 SDValue FALSE = LHS->getOperand(2);
14407 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14408 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14409 SDValue XSelect =
14410 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14411 return XSelect;
14412 }
14413
14414 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14415 // fneg-like xors into 64-bit select.
14416 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14417 // This looks like an fneg, try to fold as a source modifier.
14418 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14419         shouldFoldFNegIntoSrc(N, LHS)) {
14420       // xor (select c, a, b), 0x80000000 ->
14421 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14422 SDLoc DL(N);
14423 SDValue CastLHS =
14424 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14425 SDValue CastRHS =
14426 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14427 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14428 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14429 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14430 LHS->getOperand(0), FNegLHS, FNegRHS);
14431 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14432 }
14433 }
14434
14435 return SDValue();
14436}
14437
14438SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14439 DAGCombinerInfo &DCI) const {
14440 if (!Subtarget->has16BitInsts() ||
14441 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14442 return SDValue();
14443
14444 EVT VT = N->getValueType(0);
14445 if (VT != MVT::i32)
14446 return SDValue();
14447
14448 SDValue Src = N->getOperand(0);
14449 if (Src.getValueType() != MVT::i16)
14450 return SDValue();
14451
14452 return SDValue();
14453}
14454
14455SDValue
14456SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14457 DAGCombinerInfo &DCI) const {
14458 SDValue Src = N->getOperand(0);
14459 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14460
14461 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14462 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14463 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14464 VTSign->getVT() == MVT::i8) ||
14465 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14466 VTSign->getVT() == MVT::i16))) {
14467 assert(Subtarget->hasScalarSubwordLoads() &&
14468 "s_buffer_load_{u8, i8} are supported "
14469 "in GFX12 (or newer) architectures.");
14470 EVT VT = Src.getValueType();
14471 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14472 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14473 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14474 SDLoc DL(N);
14475 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14476 SDValue Ops[] = {
14477 Src.getOperand(0), // source register
14478 Src.getOperand(1), // offset
14479 Src.getOperand(2) // cachePolicy
14480 };
14481 auto *M = cast<MemSDNode>(Src);
14482 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14483 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14484 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14485 return LoadVal;
14486 }
14487 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14488 VTSign->getVT() == MVT::i8) ||
14489 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14490 VTSign->getVT() == MVT::i16)) &&
14491 Src.hasOneUse()) {
14492 auto *M = cast<MemSDNode>(Src);
14493 SDValue Ops[] = {Src.getOperand(0), // Chain
14494 Src.getOperand(1), // rsrc
14495 Src.getOperand(2), // vindex
14496 Src.getOperand(3), // voffset
14497 Src.getOperand(4), // soffset
14498 Src.getOperand(5), // offset
14499 Src.getOperand(6), Src.getOperand(7)};
14500 // replace with BUFFER_LOAD_BYTE/SHORT
14501 SDVTList ResList =
14502 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14503 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14504 ? AMDGPUISD::BUFFER_LOAD_BYTE
14505 : AMDGPUISD::BUFFER_LOAD_SHORT;
14506 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14507 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14508 return DCI.DAG.getMergeValues(
14509 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14510 }
14511 return SDValue();
14512}
14513
14514SDValue SITargetLowering::performClassCombine(SDNode *N,
14515 DAGCombinerInfo &DCI) const {
14516 SelectionDAG &DAG = DCI.DAG;
14517 SDValue Mask = N->getOperand(1);
14518
14519 // fp_class x, 0 -> false
14520 if (isNullConstant(Mask))
14521 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14522
14523 if (N->getOperand(0).isUndef())
14524 return DAG.getUNDEF(MVT::i1);
14525
14526 return SDValue();
14527}
14528
14529SDValue SITargetLowering::performRcpCombine(SDNode *N,
14530 DAGCombinerInfo &DCI) const {
14531 EVT VT = N->getValueType(0);
14532 SDValue N0 = N->getOperand(0);
14533
14534 if (N0.isUndef()) {
14535 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14536 SDLoc(N), VT);
14537 }
14538
14539 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14540 N0.getOpcode() == ISD::SINT_TO_FP)) {
14541 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14542 N->getFlags());
14543 }
14544
14545 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14546 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14547 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14548 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14549 N->getFlags());
14550 }
14551
14552 return SDValue();
14553}
14554
14555bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14556 unsigned MaxDepth) const {
14557 unsigned Opcode = Op.getOpcode();
14558 if (Opcode == ISD::FCANONICALIZE)
14559 return true;
14560
14561 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14562 const auto &F = CFP->getValueAPF();
14563 if (F.isNaN() && F.isSignaling())
14564 return false;
14565 if (!F.isDenormal())
14566 return true;
14567
14568 DenormalMode Mode =
14569 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14570 return Mode == DenormalMode::getIEEE();
14571 }
14572
14573 // If source is a result of another standard FP operation it is already in
14574 // canonical form.
14575 if (MaxDepth == 0)
14576 return false;
14577
14578 switch (Opcode) {
14579 // These will flush denorms if required.
14580 case ISD::FADD:
14581 case ISD::FSUB:
14582 case ISD::FMUL:
14583 case ISD::FCEIL:
14584 case ISD::FFLOOR:
14585 case ISD::FMA:
14586 case ISD::FMAD:
14587 case ISD::FSQRT:
14588 case ISD::FDIV:
14589 case ISD::FREM:
14590 case ISD::FP_ROUND:
14591 case ISD::FP_EXTEND:
14592 case ISD::FP16_TO_FP:
14593 case ISD::FP_TO_FP16:
14594 case ISD::BF16_TO_FP:
14595 case ISD::FP_TO_BF16:
14596 case ISD::FLDEXP:
14597 case AMDGPUISD::FMUL_LEGACY:
14598 case AMDGPUISD::FMAD_FTZ:
14599 case AMDGPUISD::RCP:
14600 case AMDGPUISD::RSQ:
14601 case AMDGPUISD::RSQ_CLAMP:
14602 case AMDGPUISD::RCP_LEGACY:
14603 case AMDGPUISD::RCP_IFLAG:
14604 case AMDGPUISD::LOG:
14605 case AMDGPUISD::EXP:
14606 case AMDGPUISD::DIV_SCALE:
14607 case AMDGPUISD::DIV_FMAS:
14608 case AMDGPUISD::DIV_FIXUP:
14609 case AMDGPUISD::FRACT:
14610 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14611 case AMDGPUISD::CVT_F32_UBYTE0:
14612 case AMDGPUISD::CVT_F32_UBYTE1:
14613 case AMDGPUISD::CVT_F32_UBYTE2:
14614 case AMDGPUISD::CVT_F32_UBYTE3:
14615 case AMDGPUISD::FP_TO_FP16:
14616 case AMDGPUISD::SIN_HW:
14617 case AMDGPUISD::COS_HW:
14618 return true;
14619
14620 // It can/will be lowered or combined as a bit operation.
14621 // Need to check their input recursively to handle.
14622 case ISD::FNEG:
14623 case ISD::FABS:
14624 case ISD::FCOPYSIGN:
14625 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14626
14627 case ISD::AND:
14628 if (Op.getValueType() == MVT::i32) {
14629 // Be careful as we only know it is a bitcast floating point type. It
14630 // could be f32 or v2f16; we have no way of knowing. Luckily the constant
14631 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14632 // is valid to optimize for all types.
14633 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14634 if (RHS->getZExtValue() == 0xffff0000) {
14635 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14636 }
14637 }
14638 }
14639 break;
14640
14641 case ISD::FSIN:
14642 case ISD::FCOS:
14643 case ISD::FSINCOS:
14644 return Op.getValueType().getScalarType() != MVT::f16;
14645
14646 case ISD::FMINNUM:
14647 case ISD::FMAXNUM:
14648 case ISD::FMINNUM_IEEE:
14649 case ISD::FMAXNUM_IEEE:
14650 case ISD::FMINIMUM:
14651 case ISD::FMAXIMUM:
14652 case ISD::FMINIMUMNUM:
14653 case ISD::FMAXIMUMNUM:
14654 case AMDGPUISD::CLAMP:
14655 case AMDGPUISD::FMED3:
14656 case AMDGPUISD::FMAX3:
14657 case AMDGPUISD::FMIN3:
14658 case AMDGPUISD::FMAXIMUM3:
14659 case AMDGPUISD::FMINIMUM3: {
14660 // FIXME: Shouldn't treat the generic operations differently based on these.
14661 // However, we aren't really required to flush the result from
14662 // minnum/maxnum.
14663
14664 // snans will be quieted, so we only need to worry about denormals.
14665 if (Subtarget->supportsMinMaxDenormModes() ||
14666 // FIXME: denormalsEnabledForType is broken for dynamic
14667 denormalsEnabledForType(DAG, Op.getValueType()))
14668 return true;
14669
14670 // Flushing may be required.
14671 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14672 // targets we need to check their inputs recursively.
14673
14674 // FIXME: Does this apply with clamp? It's implemented with max.
14675 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14676 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14677 return false;
14678 }
14679
14680 return true;
14681 }
14682 case ISD::SELECT: {
14683 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14684 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14685 }
14686 case ISD::BUILD_VECTOR: {
14687 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14688 SDValue SrcOp = Op.getOperand(i);
14689 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14690 return false;
14691 }
14692
14693 return true;
14694 }
14695 case ISD::EXTRACT_VECTOR_ELT:
14696 case ISD::EXTRACT_SUBVECTOR: {
14697 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14698 }
14699 case ISD::INSERT_VECTOR_ELT: {
14700 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14701 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14702 }
14703 case ISD::UNDEF:
14704 // Could be anything.
14705 return false;
14706
14707 case ISD::BITCAST:
14708 // TODO: This is incorrect as it loses track of the operand's type. We may
14709 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14710 // same bits that are canonicalized in one type need not be in the other.
14711 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14712 case ISD::TRUNCATE: {
14713 // Hack around the mess we make when legalizing extract_vector_elt
14714 if (Op.getValueType() == MVT::i16) {
14715 SDValue TruncSrc = Op.getOperand(0);
14716 if (TruncSrc.getValueType() == MVT::i32 &&
14717 TruncSrc.getOpcode() == ISD::BITCAST &&
14718 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14719 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14720 }
14721 }
14722 return false;
14723 }
14724 case ISD::INTRINSIC_WO_CHAIN: {
14725 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14726 // TODO: Handle more intrinsics
14727 switch (IntrinsicID) {
14728 case Intrinsic::amdgcn_cvt_pkrtz:
14729 case Intrinsic::amdgcn_cubeid:
14730 case Intrinsic::amdgcn_frexp_mant:
14731 case Intrinsic::amdgcn_fdot2:
14732 case Intrinsic::amdgcn_rcp:
14733 case Intrinsic::amdgcn_rsq:
14734 case Intrinsic::amdgcn_rsq_clamp:
14735 case Intrinsic::amdgcn_rcp_legacy:
14736 case Intrinsic::amdgcn_rsq_legacy:
14737 case Intrinsic::amdgcn_trig_preop:
14738 case Intrinsic::amdgcn_tanh:
14739 case Intrinsic::amdgcn_log:
14740 case Intrinsic::amdgcn_exp2:
14741 case Intrinsic::amdgcn_sqrt:
14742 return true;
14743 default:
14744 break;
14745 }
14746
14747 break;
14748 }
14749 default:
14750 break;
14751 }
14752
14753 // FIXME: denormalsEnabledForType is broken for dynamic
14754 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14755 DAG.isKnownNeverSNaN(Op);
14756}
14757
14758bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14759 unsigned MaxDepth) const {
14760 const MachineRegisterInfo &MRI = MF.getRegInfo();
14761 MachineInstr *MI = MRI.getVRegDef(Reg);
14762 unsigned Opcode = MI->getOpcode();
14763
14764 if (Opcode == AMDGPU::G_FCANONICALIZE)
14765 return true;
14766
14767 std::optional<FPValueAndVReg> FCR;
14768 // Constant splat (can be padded with undef) or scalar constant.
14769 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14770 if (FCR->Value.isSignaling())
14771 return false;
14772 if (!FCR->Value.isDenormal())
14773 return true;
14774
14775 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14776 return Mode == DenormalMode::getIEEE();
14777 }
14778
14779 if (MaxDepth == 0)
14780 return false;
14781
14782 switch (Opcode) {
14783 case AMDGPU::G_FADD:
14784 case AMDGPU::G_FSUB:
14785 case AMDGPU::G_FMUL:
14786 case AMDGPU::G_FCEIL:
14787 case AMDGPU::G_FFLOOR:
14788 case AMDGPU::G_FRINT:
14789 case AMDGPU::G_FNEARBYINT:
14790 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14791 case AMDGPU::G_INTRINSIC_TRUNC:
14792 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14793 case AMDGPU::G_FMA:
14794 case AMDGPU::G_FMAD:
14795 case AMDGPU::G_FSQRT:
14796 case AMDGPU::G_FDIV:
14797 case AMDGPU::G_FREM:
14798 case AMDGPU::G_FPOW:
14799 case AMDGPU::G_FPEXT:
14800 case AMDGPU::G_FLOG:
14801 case AMDGPU::G_FLOG2:
14802 case AMDGPU::G_FLOG10:
14803 case AMDGPU::G_FPTRUNC:
14804 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14805 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14806 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14807 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14808 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14809 return true;
14810 case AMDGPU::G_FNEG:
14811 case AMDGPU::G_FABS:
14812 case AMDGPU::G_FCOPYSIGN:
14813 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14814 case AMDGPU::G_FMINNUM:
14815 case AMDGPU::G_FMAXNUM:
14816 case AMDGPU::G_FMINNUM_IEEE:
14817 case AMDGPU::G_FMAXNUM_IEEE:
14818 case AMDGPU::G_FMINIMUM:
14819 case AMDGPU::G_FMAXIMUM:
14820 case AMDGPU::G_FMINIMUMNUM:
14821 case AMDGPU::G_FMAXIMUMNUM: {
14822 if (Subtarget->supportsMinMaxDenormModes() ||
14823 // FIXME: denormalsEnabledForType is broken for dynamic
14824 denormalsEnabledForType(MRI.getType(Reg), MF))
14825 return true;
14826
14827 [[fallthrough]];
14828 }
14829 case AMDGPU::G_BUILD_VECTOR:
14830 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14831 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14832 return false;
14833 return true;
14834 case AMDGPU::G_INTRINSIC:
14835 case AMDGPU::G_INTRINSIC_CONVERGENT:
14836 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14837 case Intrinsic::amdgcn_fmul_legacy:
14838 case Intrinsic::amdgcn_fmad_ftz:
14839 case Intrinsic::amdgcn_sqrt:
14840 case Intrinsic::amdgcn_fmed3:
14841 case Intrinsic::amdgcn_sin:
14842 case Intrinsic::amdgcn_cos:
14843 case Intrinsic::amdgcn_log:
14844 case Intrinsic::amdgcn_exp2:
14845 case Intrinsic::amdgcn_log_clamp:
14846 case Intrinsic::amdgcn_rcp:
14847 case Intrinsic::amdgcn_rcp_legacy:
14848 case Intrinsic::amdgcn_rsq:
14849 case Intrinsic::amdgcn_rsq_clamp:
14850 case Intrinsic::amdgcn_rsq_legacy:
14851 case Intrinsic::amdgcn_div_scale:
14852 case Intrinsic::amdgcn_div_fmas:
14853 case Intrinsic::amdgcn_div_fixup:
14854 case Intrinsic::amdgcn_fract:
14855 case Intrinsic::amdgcn_cvt_pkrtz:
14856 case Intrinsic::amdgcn_cubeid:
14857 case Intrinsic::amdgcn_cubema:
14858 case Intrinsic::amdgcn_cubesc:
14859 case Intrinsic::amdgcn_cubetc:
14860 case Intrinsic::amdgcn_frexp_mant:
14861 case Intrinsic::amdgcn_fdot2:
14862 case Intrinsic::amdgcn_trig_preop:
14863 case Intrinsic::amdgcn_tanh:
14864 return true;
14865 default:
14866 break;
14867 }
14868
14869 [[fallthrough]];
14870 default:
14871 return false;
14872 }
14873
14874 llvm_unreachable("invalid operation");
14875}
14876
14877// Constant fold canonicalize.
14878SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14879 const SDLoc &SL, EVT VT,
14880 const APFloat &C) const {
14881 // Flush denormals to 0 if not enabled.
14882 if (C.isDenormal()) {
14883 DenormalMode Mode =
14884 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14885 if (Mode == DenormalMode::getPreserveSign()) {
14886 return DAG.getConstantFP(
14887 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14888 }
14889
14890 if (Mode != DenormalMode::getIEEE())
14891 return SDValue();
14892 }
14893
14894 if (C.isNaN()) {
14895 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14896 if (C.isSignaling()) {
14897 // Quiet a signaling NaN.
14898 // FIXME: Is this supposed to preserve payload bits?
14899 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14900 }
14901
14902 // Make sure it is the canonical NaN bitpattern.
14903 //
14904 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14905 // immediate?
14906 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14907 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14908 }
14909
14910 // Already canonical.
14911 return DAG.getConstantFP(C, SL, VT);
14912}
14913
14914static bool vectorEltWillFoldAway(SDValue Op) {
14915 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14916}
14917
14918SDValue
14919SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14920 DAGCombinerInfo &DCI) const {
14921 SelectionDAG &DAG = DCI.DAG;
14922 SDValue N0 = N->getOperand(0);
14923 EVT VT = N->getValueType(0);
14924
14925 // fcanonicalize undef -> qnan
14926 if (N0.isUndef()) {
14927 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14928 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14929 }
14930
14931 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14932 EVT VT = N->getValueType(0);
14933 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14934 }
14935
14936 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14937 // (fcanonicalize k)
14938 //
14939 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14940
14941 // TODO: This could be better with wider vectors that will be split to v2f16,
14942 // and to consider uses since there aren't that many packed operations.
14943 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14944 isTypeLegal(MVT::v2f16)) {
14945 SDLoc SL(N);
14946 SDValue NewElts[2];
14947 SDValue Lo = N0.getOperand(0);
14948 SDValue Hi = N0.getOperand(1);
14949 EVT EltVT = Lo.getValueType();
14950
14952 for (unsigned I = 0; I != 2; ++I) {
14953 SDValue Op = N0.getOperand(I);
14954 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14955 NewElts[I] =
14956 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14957 } else if (Op.isUndef()) {
14958 // Handled below based on what the other operand is.
14959 NewElts[I] = Op;
14960 } else {
14961 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14962 }
14963 }
14964
14965 // If one half is undef, and one is constant, prefer a splat vector rather
14966 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14967 // cheaper to use and may be free with a packed operation.
14968 if (NewElts[0].isUndef()) {
14969 // Mirror the NewElts[1] handling below.
14970 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14971 ? NewElts[1]
14972 : DAG.getConstantFP(0.0f, SL, EltVT);
14973 }
14974
14975 if (NewElts[1].isUndef()) {
14976 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14977 ? NewElts[0]
14978 : DAG.getConstantFP(0.0f, SL, EltVT);
14979 }
14980
14981 return DAG.getBuildVector(VT, SL, NewElts);
14982 }
14983 }
14984
14985 return SDValue();
14986}
14987
14988static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14989 switch (Opc) {
14990 case ISD::FMAXNUM:
14991 case ISD::FMAXNUM_IEEE:
14992 case ISD::FMAXIMUMNUM:
14993 return AMDGPUISD::FMAX3;
14994 case ISD::FMAXIMUM:
14995 return AMDGPUISD::FMAXIMUM3;
14996 case ISD::SMAX:
14997 return AMDGPUISD::SMAX3;
14998 case ISD::UMAX:
14999 return AMDGPUISD::UMAX3;
15000 case ISD::FMINNUM:
15001 case ISD::FMINNUM_IEEE:
15002 case ISD::FMINIMUMNUM:
15003 return AMDGPUISD::FMIN3;
15004 case ISD::FMINIMUM:
15005 return AMDGPUISD::FMINIMUM3;
15006 case ISD::SMIN:
15007 return AMDGPUISD::SMIN3;
15008 case ISD::UMIN:
15009 return AMDGPUISD::UMIN3;
15010 default:
15011 llvm_unreachable("Not a min/max opcode");
15012 }
15013}
15014
15015SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15016 const SDLoc &SL, SDValue Src,
15017 SDValue MinVal,
15018 SDValue MaxVal,
15019 bool Signed) const {
15020
15021 // med3 comes from
15022 // min(max(x, K0), K1), K0 < K1
15023 // max(min(x, K0), K1), K1 < K0
15024 //
15025 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15026 // min/max op.
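 // e.g. smin(smax(x, -4), 7) becomes smed3(x, -4, 7); the constants must
 // satisfy K0 < K1 (or K1 < K0 for the max(min) form), as checked below.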
15027 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15028 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15029
15030 if (!MinK || !MaxK)
15031 return SDValue();
15032
15033 if (Signed) {
15034 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15035 return SDValue();
15036 } else {
15037 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15038 return SDValue();
15039 }
15040
15041 EVT VT = MinK->getValueType(0);
15042 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15043 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15044 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15045
15046 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15047 // not available, but this is unlikely to be profitable as constants
15048 // will often need to be materialized & extended, especially on
15049 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15050 return SDValue();
15051}
15052
15053static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15054 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
15055 return C;
15056
15057 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
15058 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15059 return C;
15060 }
15061
15062 return nullptr;
15063}
15064
15065SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15066 const SDLoc &SL, SDValue Op0,
15067 SDValue Op1) const {
15068 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
15069 if (!K1)
15070 return SDValue();
15071
15072 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15073 if (!K0)
15074 return SDValue();
15075
15076 // Ordered >= (although NaN inputs should have folded away by now).
15077 if (K0->getValueAPF() > K1->getValueAPF())
15078 return SDValue();
15079
15080 // med3 with a nan input acts like
15081 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15082 //
15083 // So the result depends on whether the IEEE mode bit is enabled or not with a
15084 // signaling nan input.
15085 // ieee=1
15086 // s0 snan: yields s2
15087 // s1 snan: yields s2
15088 // s2 snan: qnan
15089
15090 // s0 qnan: min(s1, s2)
15091 // s1 qnan: min(s0, s2)
15092 // s2 qnan: min(s0, s1)
15093
15094 // ieee=0
15095 // s0 snan: min(s1, s2)
15096 // s1 snan: min(s0, s2)
15097 // s2 snan: qnan
15098
15099 // s0 qnan: min(s1, s2)
15100 // s1 qnan: min(s0, s2)
15101 // s2 qnan: min(s0, s1)
15102 const MachineFunction &MF = DAG.getMachineFunction();
15103 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15104
15105 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15106 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15107 // can only form it if op0 is fmaxnum_ieee and IEEE=1.
15108 EVT VT = Op0.getValueType();
15109 if (Info->getMode().DX10Clamp) {
15110 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15111 // hardware fmed3 behavior converting to a min.
15112 // FIXME: Should this be allowing -0.0?
15113 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15114 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15115 }
15116
15117 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15118 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15119 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15120 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15121 // then give the other result, which is different from med3 with a NaN
15122 // input.
15123 SDValue Var = Op0.getOperand(0);
15124 if (!DAG.isKnownNeverSNaN(Var))
15125 return SDValue();
15126
15127 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15128
15129 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15130 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15131 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15132 SDValue(K0, 0), SDValue(K1, 0));
15133 }
15134 }
15135
15136 return SDValue();
15137}
15138
15139/// \return true if the subtarget supports minimum3 and maximum3 with the given
15140/// base min/max opcode \p Opc for type \p VT.
15141static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15142 EVT VT) {
15143 switch (Opc) {
15144 case ISD::FMINNUM:
15145 case ISD::FMAXNUM:
15146 case ISD::FMINNUM_IEEE:
15147 case ISD::FMAXNUM_IEEE:
15148 case ISD::FMINIMUMNUM:
15149 case ISD::FMAXIMUMNUM:
15150 case AMDGPUISD::FMIN_LEGACY:
15151 case AMDGPUISD::FMAX_LEGACY:
15152 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15153 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15154 case ISD::FMINIMUM:
15155 case ISD::FMAXIMUM:
15156 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15157 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15158 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15159 case ISD::SMAX:
15160 case ISD::SMIN:
15161 case ISD::UMAX:
15162 case ISD::UMIN:
15163 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15164 default:
15165 return false;
15166 }
15167
15168 llvm_unreachable("not a min/max opcode");
15169}
15170
15171SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15172 DAGCombinerInfo &DCI) const {
15173 SelectionDAG &DAG = DCI.DAG;
15174
15175 EVT VT = N->getValueType(0);
15176 unsigned Opc = N->getOpcode();
15177 SDValue Op0 = N->getOperand(0);
15178 SDValue Op1 = N->getOperand(1);
15179
15180 // Only do this if the inner op has one use since this will just increase
15181 // register pressure for no benefit.
15182
15183 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15184 // max(max(a, b), c) -> max3(a, b, c)
15185 // min(min(a, b), c) -> min3(a, b, c)
15186 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15187 SDLoc DL(N);
15188 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15189 Op0.getOperand(0), Op0.getOperand(1), Op1);
15190 }
15191
15192 // Try commuted.
15193 // max(a, max(b, c)) -> max3(a, b, c)
15194 // min(a, min(b, c)) -> min3(a, b, c)
15195 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15196 SDLoc DL(N);
15197 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15198 Op0, Op1.getOperand(0), Op1.getOperand(1));
15199 }
15200 }
15201
15202 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15203 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15204 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15205 if (SDValue Med3 = performIntMed3ImmCombine(
15206 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15207 return Med3;
15208 }
15209 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15210 if (SDValue Med3 = performIntMed3ImmCombine(
15211 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15212 return Med3;
15213 }
15214
15215 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15216 if (SDValue Med3 = performIntMed3ImmCombine(
15217 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15218 return Med3;
15219 }
15220 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15221 if (SDValue Med3 = performIntMed3ImmCombine(
15222 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15223 return Med3;
15224 }
15225
15226 // if !is_snan(x):
15227 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15228 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15229 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15230 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15231 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15232 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15233 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15234 (Opc == AMDGPUISD::FMIN_LEGACY &&
15235 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15236 (VT == MVT::f32 || VT == MVT::f64 ||
15237 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15238 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15239 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15240 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15241 Op0.hasOneUse()) {
15242 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15243 return Res;
15244 }
15245
15246 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15247 // for some types, but at a higher cost since they're implemented with a
15248 // three-operand form.
15249 const SDNodeFlags Flags = N->getFlags();
15250 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15251 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15252 unsigned NewOpc =
15253 (Opc == ISD::FMINIMUM) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15254 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15255 }
15256
15257 return SDValue();
15258}
15259
15260static bool isClampZeroToOne(SDValue A, SDValue B) {
15261 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15262 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15263 // FIXME: Should this be allowing -0.0?
15264 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15265 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15266 }
15267 }
15268
15269 return false;
15270}
15271
15272// FIXME: Should only worry about snans for version with chain.
15273SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15274 DAGCombinerInfo &DCI) const {
15275 EVT VT = N->getValueType(0);
15276 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15277 // NaNs. With a NaN input, the order of the operands may change the result.
15278
15279 SelectionDAG &DAG = DCI.DAG;
15280 SDLoc SL(N);
15281
15282 SDValue Src0 = N->getOperand(0);
15283 SDValue Src1 = N->getOperand(1);
15284 SDValue Src2 = N->getOperand(2);
15285
15286 if (isClampZeroToOne(Src0, Src1)) {
15287 // const_a, const_b, x -> clamp is safe in all cases including signaling
15288 // nans.
15289 // FIXME: Should this be allowing -0.0?
15290 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15291 }
15292
15293 const MachineFunction &MF = DAG.getMachineFunction();
15294 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15295
15296 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15297 // handling no dx10-clamp?
15298 if (Info->getMode().DX10Clamp) {
15299 // If NaNs are clamped to 0, we are free to reorder the inputs.
15300
15301 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15302 std::swap(Src0, Src1);
15303
15304 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15305 std::swap(Src1, Src2);
15306
15307 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15308 std::swap(Src0, Src1);
15309
15310 if (isClampZeroToOne(Src1, Src2))
15311 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15312 }
15313
15314 return SDValue();
15315}
15316
15317SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15318 DAGCombinerInfo &DCI) const {
15319 SDValue Src0 = N->getOperand(0);
15320 SDValue Src1 = N->getOperand(1);
15321 if (Src0.isUndef() && Src1.isUndef())
15322 return DCI.DAG.getUNDEF(N->getValueType(0));
15323 return SDValue();
15324}
15325
15326// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15327 // expanded into a set of cmp/select instructions.
15328bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15329 unsigned NumElem,
15330 bool IsDivergentIdx,
15331 const GCNSubtarget *Subtarget) {
15332 if (UseDivergentRegisterIndexing)
15333 return false;
15334
15335 unsigned VecSize = EltSize * NumElem;
15336
15337 // Sub-dword vectors of 2 dwords or less have a better implementation.
15338 if (VecSize <= 64 && EltSize < 32)
15339 return false;
15340
15341 // Always expand the remaining sub-dword cases, otherwise they will be
15342 // lowered via memory.
15343 if (EltSize < 32)
15344 return true;
15345
15346 // Always do this if var-idx is divergent, otherwise it will become a loop.
15347 if (IsDivergentIdx)
15348 return true;
15349
15350 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15351 unsigned NumInsts = NumElem /* Number of compares */ +
15352 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
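 // e.g. expanding a v8i32 extract costs 8 compares + 8 cndmasks = 16
 // instructions, which is right at the VGPR-index-mode threshold below.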
15353
15354 // On some architectures (GFX9) movrel is not available and it's better
15355 // to expand.
15356 if (Subtarget->useVGPRIndexMode())
15357 return NumInsts <= 16;
15358
15359 // If movrel is available, use it instead of expanding for vector of 8
15360 // elements.
15361 if (Subtarget->hasMovrel())
15362 return NumInsts <= 15;
15363
15364 return true;
15365}
15366
15367static bool shouldExpandVectorDynExt(SDNode *N) {
15368 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15369 if (isa<ConstantSDNode>(Idx))
15370 return false;
15371
15372 SDValue Vec = N->getOperand(0);
15373 EVT VecVT = Vec.getValueType();
15374 EVT EltVT = VecVT.getVectorElementType();
15375 unsigned EltSize = EltVT.getSizeInBits();
15376 unsigned NumElem = VecVT.getVectorNumElements();
15377
15378 return SITargetLowering::shouldExpandVectorDynExt(
15379 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15380}
15381
15382SDValue
15383SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15384 DAGCombinerInfo &DCI) const {
15385 SDValue Vec = N->getOperand(0);
15386 SelectionDAG &DAG = DCI.DAG;
15387
15388 EVT VecVT = Vec.getValueType();
15389 EVT VecEltVT = VecVT.getVectorElementType();
15390 EVT ResVT = N->getValueType(0);
15391
15392 unsigned VecSize = VecVT.getSizeInBits();
15393 unsigned VecEltSize = VecEltVT.getSizeInBits();
15394
15395 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15396 allUsesHaveSourceMods(N)) {
15397 SDLoc SL(N);
15398 SDValue Idx = N->getOperand(1);
15399 SDValue Elt =
15400 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15401 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15402 }
15403
15404 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15405 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15406 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15407 // depending on the shift operand. See e.g. performSraCombine().
15408 // This combine ensures that the optimisation is compatible with v2i32
15409 // legalised AND.
15410 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15411 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15412
15413 const ConstantSDNode *C = isConstOrConstSplat(Vec.getOperand(1));
15414 if (!C || C->getZExtValue() != 0x1f)
15415 return SDValue();
15416
15417 SDLoc SL(N);
15418 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15419 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15420 Vec->getOperand(0), N->getOperand(1));
15421 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15422 DAG.ReplaceAllUsesWith(N, A.getNode());
15423 }
15424
15425 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15426 // =>
15427 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15428 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15429 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15430 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15431 SDLoc SL(N);
15432 SDValue Idx = N->getOperand(1);
15433 unsigned Opc = Vec.getOpcode();
15434
15435 switch (Opc) {
15436 default:
15437 break;
15438 // TODO: Support other binary operations.
15439 case ISD::FADD:
15440 case ISD::FSUB:
15441 case ISD::FMUL:
15442 case ISD::ADD:
15443 case ISD::UMIN:
15444 case ISD::UMAX:
15445 case ISD::SMIN:
15446 case ISD::SMAX:
15447 case ISD::FMAXNUM:
15448 case ISD::FMINNUM:
15449 case ISD::FMAXNUM_IEEE:
15450 case ISD::FMINNUM_IEEE:
15451 case ISD::FMAXIMUM:
15452 case ISD::FMINIMUM: {
15453 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15454 Vec.getOperand(0), Idx);
15455 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15456 Vec.getOperand(1), Idx);
15457
15458 DCI.AddToWorklist(Elt0.getNode());
15459 DCI.AddToWorklist(Elt1.getNode());
15460 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15461 }
15462 }
15463 }
15464
15465 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15466 if (::shouldExpandVectorDynExt(N)) {
15467 SDLoc SL(N);
15468 SDValue Idx = N->getOperand(1);
15469 SDValue V;
15470 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15471 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15472 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15473 if (I == 0)
15474 V = Elt;
15475 else
15476 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15477 }
15478 return V;
15479 }
15480
15481 if (!DCI.isBeforeLegalize())
15482 return SDValue();
15483
15484 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15485 // elements. This exposes more load reduction opportunities by replacing
15486 // multiple small extract_vector_elements with a single 32-bit extract.
15487 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15488 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15489 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15490 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15491
15492 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15493 unsigned EltIdx = BitIndex / 32;
15494 unsigned LeftoverBitIdx = BitIndex % 32;
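 // e.g. extracting element 5 of a loaded v8i8: BitIndex = 40, so read 32-bit
 // element 1 and shift right by 8 before truncating to the element type.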
15495 SDLoc SL(N);
15496
15497 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15498 DCI.AddToWorklist(Cast.getNode());
15499
15500 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15501 DAG.getConstant(EltIdx, SL, MVT::i32));
15502 DCI.AddToWorklist(Elt.getNode());
15503 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15504 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15505 DCI.AddToWorklist(Srl.getNode());
15506
15507 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15508 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15509 DCI.AddToWorklist(Trunc.getNode());
15510
15511 if (VecEltVT == ResVT) {
15512 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15513 }
15514
15515 assert(ResVT.isScalarInteger());
15516 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15517 }
15518
15519 return SDValue();
15520}
15521
15522SDValue
15523SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15524 DAGCombinerInfo &DCI) const {
15525 SDValue Vec = N->getOperand(0);
15526 SDValue Idx = N->getOperand(2);
15527 EVT VecVT = Vec.getValueType();
15528 EVT EltVT = VecVT.getVectorElementType();
15529
15530 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15531 // => BUILD_VECTOR n x select (e, const-idx)
15532 if (!::shouldExpandVectorDynExt(N))
15533 return SDValue();
15534
15535 SelectionDAG &DAG = DCI.DAG;
15536 SDLoc SL(N);
15537 SDValue Ins = N->getOperand(1);
15538 EVT IdxVT = Idx.getValueType();
15539
15540 SmallVector<SDValue, 16> Ops;
15541 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15542 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15543 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15544 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15545 Ops.push_back(V);
15546 }
15547
15548 return DAG.getBuildVector(VecVT, SL, Ops);
15549}
15550
15551/// Return the source of an fp_extend from f16 to f32, or a converted FP
15552/// constant.
15553static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15554 if (Src.getOpcode() == ISD::FP_EXTEND &&
15555 Src.getOperand(0).getValueType() == MVT::f16) {
15556 return Src.getOperand(0);
15557 }
15558
15559 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15560 APFloat Val = CFP->getValueAPF();
15561 bool LosesInfo = true;
15562 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15563 if (!LosesInfo)
15564 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15565 }
15566
15567 return SDValue();
15568}
15569
15570SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15571 DAGCombinerInfo &DCI) const {
15572 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15573 "combine only useful on gfx8");
15574
15575 SDValue TruncSrc = N->getOperand(0);
15576 EVT VT = N->getValueType(0);
15577 if (VT != MVT::f16)
15578 return SDValue();
15579
15580 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15581 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15582 return SDValue();
15583
15584 SelectionDAG &DAG = DCI.DAG;
15585 SDLoc SL(N);
15586
15587 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15588 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15589 // casting back.
15590
15591 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15592 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15593 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15594 if (!A)
15595 return SDValue();
15596
15597 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15598 if (!B)
15599 return SDValue();
15600
15601 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15602 if (!C)
15603 return SDValue();
15604
15605 // This changes signaling nan behavior. If an input is a signaling nan, it
15606 // would have been quieted by the fpext originally. We don't care because
15607 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15608 // we would be worse off than just doing the promotion.
15609 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15610 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15611 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15612 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15613}
15614
15615unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15616 const SDNode *N0,
15617 const SDNode *N1) const {
15618 EVT VT = N0->getValueType(0);
15619
15620 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15621 // support denormals ever.
15622 if (((VT == MVT::f32 &&
15623 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15624 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15625 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15626 isOperationLegal(ISD::FMAD, VT))
15627 return ISD::FMAD;
15628
15629 const TargetOptions &Options = DAG.getTarget().Options;
15630 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15631 (N0->getFlags().hasAllowContract() &&
15632 N1->getFlags().hasAllowContract())) &&
15633 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15634 return ISD::FMA;
15635 }
15636
15637 return 0;
15638}
15639
15640// For a reassociatable opcode perform:
15641// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15642SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15643 SelectionDAG &DAG) const {
15644 EVT VT = N->getValueType(0);
15645 if (VT != MVT::i32 && VT != MVT::i64)
15646 return SDValue();
15647
15648 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15649 return SDValue();
15650
15651 unsigned Opc = N->getOpcode();
15652 SDValue Op0 = N->getOperand(0);
15653 SDValue Op1 = N->getOperand(1);
15654
15655 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15656 return SDValue();
15657
15658 if (Op0->isDivergent())
15659 std::swap(Op0, Op1);
15660
15661 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15662 return SDValue();
15663
15664 SDValue Op2 = Op1.getOperand(1);
15665 Op1 = Op1.getOperand(0);
15666 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15667 return SDValue();
15668
15669 if (Op1->isDivergent())
15670 std::swap(Op1, Op2);
15671
15672 SDLoc SL(N);
15673 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15674 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15675}
15676
15677static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15678 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15679 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15680 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15681 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15682 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15683}
15684
15685// Fold
15686// y = lshr i64 x, 32
15687// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15688// with Const.hi == -1
15689// To
15690// res = mad_u64_u32 y.lo, Const.lo, x.lo
15691static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15692 SDValue MulLHS, SDValue MulRHS,
15693 SDValue AddRHS) {
15694 if (MulRHS.getOpcode() == ISD::SRL)
15695 std::swap(MulLHS, MulRHS);
15696
15697 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15698 return SDValue();
15699
15700 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15701 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15702 MulLHS.getOperand(0) != AddRHS)
15703 return SDValue();
15704
15705 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
15706 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15707 return SDValue();
15708
15709 SDValue ConstMul =
15710 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15711 return getMad64_32(DAG, SL, MVT::i64,
15712 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15713 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15714}
15715
15716// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15717// multiplies, if any.
15718//
15719// Full 64-bit multiplies that feed into an addition are lowered here instead
15720// of using the generic expansion. The generic expansion ends up with
15721// a tree of ADD nodes that prevents us from using the "add" part of the
15722// MAD instruction. The expansion produced here results in a chain of ADDs
15723// instead of a tree.
15724SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15725 DAGCombinerInfo &DCI) const {
15726 assert(N->isAnyAdd());
15727
15728 SelectionDAG &DAG = DCI.DAG;
15729 EVT VT = N->getValueType(0);
15730 SDLoc SL(N);
15731 SDValue LHS = N->getOperand(0);
15732 SDValue RHS = N->getOperand(1);
15733
15734 if (VT.isVector())
15735 return SDValue();
15736
15737 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15738 // result in scalar registers for uniform values.
15739 if (!N->isDivergent() && Subtarget->hasSMulHi())
15740 return SDValue();
15741
15742 unsigned NumBits = VT.getScalarSizeInBits();
15743 if (NumBits <= 32 || NumBits > 64)
15744 return SDValue();
15745
15746 if (LHS.getOpcode() != ISD::MUL) {
15747 assert(RHS.getOpcode() == ISD::MUL);
15748 std::swap(LHS, RHS);
15749 }
15750
15751 // Avoid the fold if it would unduly increase the number of multiplies due to
15752 // multiple uses, except on hardware with full-rate multiply-add (which is
15753 // part of full-rate 64-bit ops).
15754 if (!Subtarget->hasFullRate64Ops()) {
15755 unsigned NumUsers = 0;
15756 for (SDNode *User : LHS->users()) {
15757 // There is a use that does not feed into addition, so the multiply can't
15758 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15759 if (!User->isAnyAdd())
15760 return SDValue();
15761
15762 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15763 // MUL + 3xADD + 3xADDC over 3xMAD.
15764 ++NumUsers;
15765 if (NumUsers >= 3)
15766 return SDValue();
15767 }
15768 }
15769
15770 SDValue MulLHS = LHS.getOperand(0);
15771 SDValue MulRHS = LHS.getOperand(1);
15772 SDValue AddRHS = RHS;
15773
15774 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15775 return FoldedMAD;
15776
15777 // Always check whether operands are small unsigned values, since that
15778 // knowledge is useful in more cases. Check for small signed values only if
15779 // doing so can unlock a shorter code sequence.
15780 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15781 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15782
15783 bool MulSignedLo = false;
15784 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15785 MulSignedLo =
15786 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15787 }
15788
15789 // The operands and final result all have the same number of bits. If
15790 // operands need to be extended, they can be extended with garbage. The
15791 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15792 // truncated away in the end.
15793 if (VT != MVT::i64) {
15794 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15795 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15796 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15797 }
15798
15799 // The basic code generated is conceptually straightforward. Pseudo code:
15800 //
15801 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15802 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15803 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15804 //
15805 // The second and third lines are optional, depending on whether the factors
15806 // are {sign,zero}-extended or not.
15807 //
15808 // The actual DAG is noisier than the pseudo code, but only due to
15809 // instructions that disassemble values into low and high parts, and
15810 // assemble the final result.
15811 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15812
15813 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15814 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15815 SDValue Accum =
15816 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15817
15818 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15819 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15820
15821 if (!MulLHSUnsigned32) {
15822 auto MulLHSHi =
15823 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15824 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15825 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15826 }
15827
15828 if (!MulRHSUnsigned32) {
15829 auto MulRHSHi =
15830 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15831 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15832 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15833 }
15834
15835 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15836 Accum = DAG.getBitcast(MVT::i64, Accum);
15837 }
15838
15839 if (VT != MVT::i64)
15840 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15841 return Accum;
15842}
15843
15844SDValue
15845SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15846 DAGCombinerInfo &DCI) const {
15847 SDValue RHS = N->getOperand(1);
15848 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15849 if (!CRHS)
15850 return SDValue();
15851
15852 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15853 // common.
15854 uint64_t Val = CRHS->getZExtValue();
15855 if (countr_zero(Val) >= 32) {
15856 SelectionDAG &DAG = DCI.DAG;
15857 SDLoc SL(N);
15858 SDValue LHS = N->getOperand(0);
15859
15860 // Avoid carry machinery if we know the low half of the add does not
15861 // contribute to the final result.
15862 //
15863 // add i64:x, K if computeTrailingZeros(K) >= 32
15864 // => build_pair (add x.hi, K.hi), x.lo
15865
15866 // Breaking the 64-bit add here with this strange constant is unlikely
15867 // to interfere with addressing mode patterns.
15868
15869 SDValue Hi = getHiHalf64(LHS, DAG);
15870 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15871 unsigned Opcode = N->getOpcode();
15872 if (Opcode == ISD::PTRADD)
15873 Opcode = ISD::ADD;
15874 SDValue AddHi =
15875 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15876
15877 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15878 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15879 }
15880
15881 return SDValue();
15882}
15883
15884// Collect the ultimate src of each of the mul node's operands, and confirm
15885// each operand is 8 bits wide.
15886static std::optional<ByteProvider<SDValue>>
15887handleMulOperand(const SDValue &MulOperand) {
15888 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15889 if (!Byte0 || Byte0->isConstantZero()) {
15890 return std::nullopt;
15891 }
15892 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15893 if (Byte1 && !Byte1->isConstantZero()) {
15894 return std::nullopt;
15895 }
15896 return Byte0;
15897}
15898
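// Merge two v_perm byte-select masks. A selector byte of 0x0c picks constant
// zero, so a result byte stays 0x0c only when both inputs select zero there;
// otherwise the live (non-0x0c) selector from either mask is taken.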
15899static unsigned addPermMasks(unsigned First, unsigned Second) {
15900 unsigned FirstCs = First & 0x0c0c0c0c;
15901 unsigned SecondCs = Second & 0x0c0c0c0c;
15902 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15903 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15904
15905 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15906 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15907 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15908 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15909
15910 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15911}
15912
15913struct DotSrc {
15914 SDValue SrcOp;
15915 int64_t PermMask;
15916 int64_t DWordOffset;
15917};
15918
15919static void placeSources(ByteProvider<SDValue> &Src0,
15920 ByteProvider<SDValue> &Src1,
15921 SmallVectorImpl<DotSrc> &Src0s,
15922 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15923
15924 assert(Src0.Src.has_value() && Src1.Src.has_value());
15925 // Src0s and Src1s are empty, just place arbitrarily.
15926 if (Step == 0) {
15927 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15928 Src0.SrcOffset / 4});
15929 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15930 Src1.SrcOffset / 4});
15931 return;
15932 }
15933
15934 for (int BPI = 0; BPI < 2; BPI++) {
15935 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15936 if (BPI == 1) {
15937 BPP = {Src1, Src0};
15938 }
15939 unsigned ZeroMask = 0x0c0c0c0c;
15940 unsigned FMask = 0xFF << (8 * (3 - Step));
15941
15942 unsigned FirstMask =
15943 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15944 unsigned SecondMask =
15945 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
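 // e.g. at Step == 1, a byte with SrcOffset % 4 == 2 yields the mask
 // 0x0c020c0c: one live selector in byte 2, select-zero (0x0c) elsewhere.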
15946 // Attempt to find the Src vector which contains our SDValue; if so, add our
15947 // perm mask to the existing one. If we are unable to find a match for the
15948 // first SDValue, attempt to find a match for the second.
15949 int FirstGroup = -1;
15950 for (int I = 0; I < 2; I++) {
15951 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15952 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15953 return IterElt.SrcOp == *BPP.first.Src &&
15954 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15955 };
15956
15957 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15958 if (Match != Srcs.end()) {
15959 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15960 FirstGroup = I;
15961 break;
15962 }
15963 }
15964 if (FirstGroup != -1) {
15965 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15966 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15967 return IterElt.SrcOp == *BPP.second.Src &&
15968 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15969 };
15970 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15971 if (Match != Srcs.end()) {
15972 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15973 } else
15974 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15975 return;
15976 }
15977 }
15978
15979 // If we have made it here, then we could not find a match in Src0s or Src1s
15980 // for either Src0 or Src1, so just place them arbitrarily.
15981
15982 unsigned ZeroMask = 0x0c0c0c0c;
15983 unsigned FMask = 0xFF << (8 * (3 - Step));
15984
15985 Src0s.push_back(
15986 {*Src0.Src,
15987 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15988 Src0.SrcOffset / 4});
15989 Src1s.push_back(
15990 {*Src1.Src,
15991 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15992 Src1.SrcOffset / 4});
15993}
15994
15995static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15996 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15997 bool IsAny) {
15998
15999 // If we have just one source, permute it accordingly.
16000 if (Srcs.size() == 1) {
16001 auto *Elt = Srcs.begin();
16002 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
16003
16004 // v_perm will produce the original value
16005 if (Elt->PermMask == 0x3020100)
16006 return EltOp;
16007
16008 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16009 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
16010 }
16011
16012 auto *FirstElt = Srcs.begin();
16013 auto *SecondElt = std::next(FirstElt);
16014
16015 SmallVector<SDValue, 2> Perms;
16016
16017 // If we have multiple sources in the chain, combine them via perms (using
16018 // the calculated perm masks) and ORs.
16019 while (true) {
16020 auto FirstMask = FirstElt->PermMask;
16021 auto SecondMask = SecondElt->PermMask;
16022
16023 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16024 unsigned FirstPlusFour = FirstMask | 0x04040404;
16025 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
16026 // original 0x0C.
16027 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16028
16029 auto PermMask = addPermMasks(FirstMask, SecondMask);
16030 auto FirstVal =
16031 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16032 auto SecondVal =
16033 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
16034
16035 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
16036 SecondVal,
16037 DAG.getConstant(PermMask, SL, MVT::i32)));
16038
16039 FirstElt = std::next(SecondElt);
16040 if (FirstElt == Srcs.end())
16041 break;
16042
16043 SecondElt = std::next(FirstElt);
16044 // If we only have a FirstElt, then just combine that into the cumulative
16045 // source node.
16046 if (SecondElt == Srcs.end()) {
16047 auto EltOp =
16048 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16049
16050 Perms.push_back(
16051 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16052 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16053 break;
16054 }
16055 }
16056
16057 assert(Perms.size() == 1 || Perms.size() == 2);
16058 return Perms.size() == 2
16059 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
16060 : Perms[0];
16061}
16062
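// Shift the accumulated byte selectors down into the low bytes and refill the
// vacated high bytes with 0x0c (select zero); e.g. with ChainLength == 2 the
// mask 0x01030c0c becomes 0x0c0c0103.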
16063static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16064 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16065 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16066 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16067 EntryMask += ZeroMask;
16068 }
16069}
16070
16071static bool isMul(const SDValue Op) {
16072 auto Opcode = Op.getOpcode();
16073
16074 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16075 Opcode == AMDGPUISD::MUL_I24);
16076}
16077
16078static std::optional<bool>
16079checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16080 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16081 const SDValue &S1Op, const SelectionDAG &DAG) {
16082 // If both ops are i8s (pre legalize-dag), then the signedness semantics
16083 // of the dot4 is irrelevant.
16084 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16085 return false;
16086
16087 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16088 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16089 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16090 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16091 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16092 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
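 // A known-zero MSB lets us treat the operand as unsigned; a known-one MSB
 // means it is negative and must be treated as signed.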
16093
16094 assert(!(S0IsUnsigned && S0IsSigned));
16095 assert(!(S1IsUnsigned && S1IsSigned));
16096
16097 // There are 9 possible permutations of
16098 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16099
16100 // In two permutations, the sign bits are known to be the same for both Ops,
16101 // so simply return Signed / Unsigned corresponding to the MSB
16102
16103 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16104 return S0IsSigned;
16105
16106 // In another two permutations, the sign bits are known to be opposite. In
16107 // this case return std::nullopt to indicate a bad match.
16108
16109 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16110 return std::nullopt;
16111
16112 // In the remaining five permutations, we don't know the value of the sign
16113 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16114 // the upper bits must be extension bits. Thus, the only way for the sign
16115 // bit to be unknown is if it was sign extended from an unknown value, or if
16116 // it was any extended. In either case, it is correct to use the signed
16117 // version of the signedness semantics of dot4.
16118
16119 // In two of such permutations, we known the sign bit is set for
16120 // one op, and the other is unknown. It is okay to used signed version of
16121 // dot4.
16122 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16123 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16124 return true;
16125
16126 // In one such permutation, we don't know either of the sign bits. It is okay
16127 // to use the signed version of dot4.
16128 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16129 return true;
16130
16131 // In two such permutations, we know the sign bit is unset for
16132 // one op and unknown for the other. Return std::nullopt to indicate a
16133 // bad match.
16134 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16135 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16136 return std::nullopt;
16137
16138 llvm_unreachable("Fully covered condition");
16139}
16140
16141SDValue SITargetLowering::performAddCombine(SDNode *N,
16142 DAGCombinerInfo &DCI) const {
16143 SelectionDAG &DAG = DCI.DAG;
16144 EVT VT = N->getValueType(0);
16145 SDLoc SL(N);
16146 SDValue LHS = N->getOperand(0);
16147 SDValue RHS = N->getOperand(1);
16148
16149 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16150 if (Subtarget->hasMad64_32()) {
16151 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16152 return Folded;
16153 }
16154 }
16155
16156 if (SDValue V = reassociateScalarOps(N, DAG)) {
16157 return V;
16158 }
16159
16160 if (VT == MVT::i64) {
16161 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16162 return Folded;
16163 }
16164
16165 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16166 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16167 SDValue TempNode(N, 0);
16168 std::optional<bool> IsSigned;
16169 SmallVector<DotSrc, 4> Src0s;
16170 SmallVector<DotSrc, 4> Src1s;
16171 SmallVector<SDValue, 4> Src2s;
16172
16173 // Match the v_dot4 tree, while collecting src nodes.
16174 int ChainLength = 0;
16175 for (int I = 0; I < 4; I++) {
16176 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16177 if (MulIdx == -1)
16178 break;
16179 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16180 if (!Src0)
16181 break;
16182 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16183 if (!Src1)
16184 break;
16185
16186 auto IterIsSigned = checkDot4MulSignedness(
16187 TempNode->getOperand(MulIdx), *Src0, *Src1,
16188 TempNode->getOperand(MulIdx)->getOperand(0),
16189 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16190 if (!IterIsSigned)
16191 break;
16192 if (!IsSigned)
16193 IsSigned = *IterIsSigned;
16194 if (*IterIsSigned != *IsSigned)
16195 break;
16196 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16197 auto AddIdx = 1 - MulIdx;
16198 // Allow the special case where add (add (mul24, 0), mul24) has become
16199 // add (mul24, mul24).
16200 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16201 Src2s.push_back(TempNode->getOperand(AddIdx));
16202 auto Src0 =
16203 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16204 if (!Src0)
16205 break;
16206 auto Src1 =
16207 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16208 if (!Src1)
16209 break;
16210 auto IterIsSigned = checkDot4MulSignedness(
16211 TempNode->getOperand(AddIdx), *Src0, *Src1,
16212 TempNode->getOperand(AddIdx)->getOperand(0),
16213 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16214 if (!IterIsSigned)
16215 break;
16216 assert(IsSigned);
16217 if (*IterIsSigned != *IsSigned)
16218 break;
16219 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16220 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16221 ChainLength = I + 2;
16222 break;
16223 }
16224
16225 TempNode = TempNode->getOperand(AddIdx);
16226 Src2s.push_back(TempNode);
16227 ChainLength = I + 1;
16228 if (TempNode->getNumOperands() < 2)
16229 break;
16230 LHS = TempNode->getOperand(0);
16231 RHS = TempNode->getOperand(1);
16232 }
16233
16234 if (ChainLength < 2)
16235 return SDValue();
16236
16237 // Masks were constructed with the assumption that we would find a chain of
16238 // length 4. If not, then we need to zero out the unused upper bytes (via the
16239 // perm mask value 0x0c) so they do not affect the dot calculation.
16240 if (ChainLength < 4) {
16241 fixMasks(Src0s, ChainLength);
16242 fixMasks(Src1s, ChainLength);
16243 }
16244
16245 SDValue Src0, Src1;
16246
16247 // If we are just using a single source for both, and have permuted the
16248 // bytes consistently, we can just use the sources without permuting
16249 // (commutation).
16250 bool UseOriginalSrc = false;
16251 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16252 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16253 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16254 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16255 SmallVector<unsigned, 4> SrcBytes;
16256 auto Src0Mask = Src0s.begin()->PermMask;
16257 SrcBytes.push_back(Src0Mask & 0xFF000000);
16258 bool UniqueEntries = true;
16259 for (auto I = 1; I < 4; I++) {
16260 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16261
16262 if (is_contained(SrcBytes, NextByte)) {
16263 UniqueEntries = false;
16264 break;
16265 }
16266 SrcBytes.push_back(NextByte);
16267 }
16268
16269 if (UniqueEntries) {
16270 UseOriginalSrc = true;
16271
16272 auto *FirstElt = Src0s.begin();
16273 auto FirstEltOp =
16274 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16275
16276 auto *SecondElt = Src1s.begin();
16277 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16278 SecondElt->DWordOffset);
16279
16280 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16281 MVT::getIntegerVT(32));
16282 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16283 MVT::getIntegerVT(32));
16284 }
16285 }
16286
16287 if (!UseOriginalSrc) {
16288 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16289 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16290 }
16291
16292 assert(IsSigned);
16293 SDValue Src2 =
16294 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16295
16296 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16297 : Intrinsic::amdgcn_udot4,
16298 SL, MVT::i64);
16299
16300 assert(!VT.isVector());
16301 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16302 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16303
16304 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16305 }
16306
16307 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16308 return SDValue();
16309
16310 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16311 // add x, sext (setcc) => usubo_carry x, 0, setcc
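// For instance, for an i32 x and an i1 condition cc, (add x, (zext cc)) adds
// either 0 or 1; uaddo_carry x, 0, cc computes x + 0 + cc, the same value,
// but feeds cc through the carry input instead of a separate extend.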
16312 unsigned Opc = LHS.getOpcode();
16313 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16314 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16315 std::swap(RHS, LHS);
16316
16317 Opc = RHS.getOpcode();
16318 switch (Opc) {
16319 default:
16320 break;
16321 case ISD::ZERO_EXTEND:
16322 case ISD::SIGN_EXTEND:
16323 case ISD::ANY_EXTEND: {
16324 auto Cond = RHS.getOperand(0);
16325 // If this won't be a real VOPC output, we would still need to insert an
16326 // extra instruction anyway.
16327 if (!isBoolSGPR(Cond))
16328 break;
16329 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16330 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16331 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16332 return DAG.getNode(Opc, SL, VTList, Args);
16333 }
16334 case ISD::UADDO_CARRY: {
16335 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16336 if (!isNullConstant(RHS.getOperand(1)))
16337 break;
16338 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16339 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16340 }
16341 }
16342 return SDValue();
16343}
16344
16345SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16346 DAGCombinerInfo &DCI) const {
16347 SelectionDAG &DAG = DCI.DAG;
16348 SDLoc DL(N);
16349 EVT VT = N->getValueType(0);
16350 SDValue N0 = N->getOperand(0);
16351 SDValue N1 = N->getOperand(1);
16352
16353 // The following folds transform PTRADDs into regular arithmetic in cases
16354 // where the PTRADD wouldn't be folded as an immediate offset into memory
16355 // instructions anyway. They are target-specific in that other targets might
16356 // prefer to not lose information about the pointer arithmetic.
16357
16358 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16359 // Adapted from DAGCombiner::visitADDLikeCommutative.
16360 SDValue V, K;
16361 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16362 SDNodeFlags ShlFlags = N1->getFlags();
16363 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16364 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16365 // preserved.
16366 SDNodeFlags NewShlFlags =
16367 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16368 ? SDNodeFlags::NoSignedWrap
16369 : SDNodeFlags();
16370 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16371 DCI.AddToWorklist(Inner.getNode());
16372 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16373 }
16374
16375 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16376 // performAddCombine.
16377 if (N1.getOpcode() == ISD::MUL) {
16378 if (Subtarget->hasMad64_32()) {
16379 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16380 return Folded;
16381 }
16382 }
16383
16384 // If the 32 low bits of the constant are all zero, there is nothing to fold
16385 // into an immediate offset, so it's better to eliminate the unnecessary
16386 // addition for the lower 32 bits than to preserve the PTRADD.
16387 // Analogous to a fold in performAddCombine.
16388 if (VT == MVT::i64) {
16389 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16390 return Folded;
16391 }
16392
16393 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16394 return SDValue();
16395
16396 SDValue X = N0;
16397 SDValue Y = N1.getOperand(0);
16398 SDValue Z = N1.getOperand(1);
16399 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16400 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16401
16402 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16403 Y->isDivergent() != Z->isDivergent()) {
16404 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16405 // y are uniform and z isn't.
16406 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16407 // z are uniform and y isn't.
16408 // The goal is to push uniform operands up in the computation, so that they
16409 // can be handled with scalar operations. We can't use reassociateScalarOps
16410 // for this since it requires two identical commutative operations to
16411 // reassociate.
16412 if (Y->isDivergent())
16413 std::swap(Y, Z);
16414 // If both additions in the original were NUW, reassociation preserves that.
16415 SDNodeFlags ReassocFlags =
16416 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16417 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16418 DCI.AddToWorklist(UniformInner.getNode());
16419 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16420 }
16421
16422 return SDValue();
16423}
16424
16425SDValue SITargetLowering::performSubCombine(SDNode *N,
16426 DAGCombinerInfo &DCI) const {
16427 SelectionDAG &DAG = DCI.DAG;
16428 EVT VT = N->getValueType(0);
16429
16430 if (VT == MVT::i64) {
16431 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16432 return Folded;
16433 }
16434
16435 if (VT != MVT::i32)
16436 return SDValue();
16437
16438 SDLoc SL(N);
16439 SDValue LHS = N->getOperand(0);
16440 SDValue RHS = N->getOperand(1);
16441
16442 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16443 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16444 unsigned Opc = RHS.getOpcode();
16445 switch (Opc) {
16446 default:
16447 break;
16448 case ISD::ZERO_EXTEND:
16449 case ISD::SIGN_EXTEND:
16450 case ISD::ANY_EXTEND: {
16451 auto Cond = RHS.getOperand(0);
16452 // If this won't be a real VOPC output, we would still need to insert an
16453 // extra instruction anyway.
16454 if (!isBoolSGPR(Cond))
16455 break;
16456 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16457 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16458 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16459 return DAG.getNode(Opc, SL, VTList, Args);
16460 }
16461 }
16462
16463 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16464 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16465 if (!isNullConstant(LHS.getOperand(1)))
16466 return SDValue();
16467 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16468 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16469 }
16470 return SDValue();
16471}
16472
16473SDValue
16474SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16475 DAGCombinerInfo &DCI) const {
16476
16477 if (N->getValueType(0) != MVT::i32)
16478 return SDValue();
16479
16480 if (!isNullConstant(N->getOperand(1)))
16481 return SDValue();
16482
16483 SelectionDAG &DAG = DCI.DAG;
16484 SDValue LHS = N->getOperand(0);
16485
16486 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16487 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16488 unsigned LHSOpc = LHS.getOpcode();
16489 unsigned Opc = N->getOpcode();
16490 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16491 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16492 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16493 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16494 }
16495 return SDValue();
16496}
16497
16498SDValue SITargetLowering::performFAddCombine(SDNode *N,
16499 DAGCombinerInfo &DCI) const {
16500 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16501 return SDValue();
16502
16503 SelectionDAG &DAG = DCI.DAG;
16504 EVT VT = N->getValueType(0);
16505
16506 SDLoc SL(N);
16507 SDValue LHS = N->getOperand(0);
16508 SDValue RHS = N->getOperand(1);
16509
16510 // These should really be instruction patterns, but writing patterns with
16511 // source modifiers is a pain.
16512
16513 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16514 if (LHS.getOpcode() == ISD::FADD) {
16515 SDValue A = LHS.getOperand(0);
16516 if (A == LHS.getOperand(1)) {
16517 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16518 if (FusedOp != 0) {
16519 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16520 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16521 }
16522 }
16523 }
16524
16525 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16526 if (RHS.getOpcode() == ISD::FADD) {
16527 SDValue A = RHS.getOperand(0);
16528 if (A == RHS.getOperand(1)) {
16529 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16530 if (FusedOp != 0) {
16531 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16532 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16533 }
16534 }
16535 }
16536
16537 return SDValue();
16538}
16539
16540SDValue SITargetLowering::performFSubCombine(SDNode *N,
16541 DAGCombinerInfo &DCI) const {
16542 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16543 return SDValue();
16544
16545 SelectionDAG &DAG = DCI.DAG;
16546 SDLoc SL(N);
16547 EVT VT = N->getValueType(0);
16548 assert(!VT.isVector());
16549
16550 // Try to get the fneg to fold into the source modifier. This undoes generic
16551 // DAG combines and folds them into the mad.
16552 //
16553 // Only do this if we are not trying to support denormals. v_mad_f32 does
16554 // not support denormals ever.
16555 SDValue LHS = N->getOperand(0);
16556 SDValue RHS = N->getOperand(1);
16557 if (LHS.getOpcode() == ISD::FADD) {
16558 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16559 SDValue A = LHS.getOperand(0);
16560 if (A == LHS.getOperand(1)) {
16561 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16562 if (FusedOp != 0) {
16563 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16564 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16565
16566 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16567 }
16568 }
16569 }
16570
16571 if (RHS.getOpcode() == ISD::FADD) {
16572 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16573
16574 SDValue A = RHS.getOperand(0);
16575 if (A == RHS.getOperand(1)) {
16576 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16577 if (FusedOp != 0) {
16578 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16579 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16580 }
16581 }
16582 }
16583
16584 return SDValue();
16585}
16586
16587SDValue SITargetLowering::performFDivCombine(SDNode *N,
16588 DAGCombinerInfo &DCI) const {
16589 SelectionDAG &DAG = DCI.DAG;
16590 SDLoc SL(N);
16591 EVT VT = N->getValueType(0);
16592
16593 // fsqrt legality correlates to rsq availability.
16594 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16595 return SDValue();
16596
16597 SDValue LHS = N->getOperand(0);
16598 SDValue RHS = N->getOperand(1);
16599
16600 SDNodeFlags Flags = N->getFlags();
16601 SDNodeFlags RHSFlags = RHS->getFlags();
16602 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16603 !RHS->hasOneUse())
16604 return SDValue();
16605
16606 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16607 bool IsNegative = false;
16608 if (CLHS->isExactlyValue(1.0) ||
16609 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16610 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16611 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16612 if (RHS.getOpcode() == ISD::FSQRT) {
16613 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16614 SDValue Rsq =
16615 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16616 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16617 }
16618 }
16619 }
16620
16621 return SDValue();
16622}
16623
16624SDValue SITargetLowering::performFMulCombine(SDNode *N,
16625 DAGCombinerInfo &DCI) const {
16626 SelectionDAG &DAG = DCI.DAG;
16627 EVT VT = N->getValueType(0);
16628 EVT ScalarVT = VT.getScalarType();
16629 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
16630
16631 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16632 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16633 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16634 return SDValue();
16635 }
16636
16637 SDValue LHS = N->getOperand(0);
16638 SDValue RHS = N->getOperand(1);
16639
16640 // It is cheaper to materialize i32 inline constants than to materialize
16641 // f16 or f64 (or even non-inline f32) values; this is possible via ldexp,
16642 // as shown below:
16643 //
16644 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16645 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16646 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
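// For example, with A = 8.0 (2^3) and B = 0.5 (2^-1):
// fmul x, (select y, 8.0, 0.5) -> ldexp(x, (select i32 y, 3, -1))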
16647 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16648 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16649 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16650 if (!TrueNode)
16651 return SDValue();
16652 const ConstantFPSDNode *FalseNode =
16653 isConstOrConstSplatFP(RHS.getOperand(2));
16654 if (!FalseNode)
16655 return SDValue();
16656
16657 if (TrueNode->isNegative() != FalseNode->isNegative())
16658 return SDValue();
16659
16660 // For f32, only non-inline constants should be transformed.
16661 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16662 if (ScalarVT == MVT::f32 &&
16663 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16664 TII->isInlineConstant(FalseNode->getValueAPF()))
16665 return SDValue();
16666
16667 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16668 if (TrueNodeExpVal == INT_MIN)
16669 return SDValue();
16670 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16671 if (FalseNodeExpVal == INT_MIN)
16672 return SDValue();
16673
16674 SDLoc SL(N);
16675 SDValue SelectNode =
16676 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16677 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16678 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16679
16680 LHS = TrueNode->isNegative()
16681 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16682 : LHS;
16683
16684 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16685 }
16686
16687 return SDValue();
16688}
16689
16690SDValue SITargetLowering::performFMACombine(SDNode *N,
16691 DAGCombinerInfo &DCI) const {
16692 SelectionDAG &DAG = DCI.DAG;
16693 EVT VT = N->getValueType(0);
16694 SDLoc SL(N);
16695
16696 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16697 return SDValue();
16698
16699 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16700 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16701 SDValue Op1 = N->getOperand(0);
16702 SDValue Op2 = N->getOperand(1);
16703 SDValue FMA = N->getOperand(2);
16704
16705 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16706 Op2.getOpcode() != ISD::FP_EXTEND)
16707 return SDValue();
16708
16709 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16710 // regardless of the denorm mode setting. Therefore,
16711 // fp-contract is sufficient to allow generating fdot2.
16712 const TargetOptions &Options = DAG.getTarget().Options;
16713 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16714 (N->getFlags().hasAllowContract() &&
16715 FMA->getFlags().hasAllowContract())) {
16716 Op1 = Op1.getOperand(0);
16717 Op2 = Op2.getOperand(0);
16718 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16719 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16720 return SDValue();
16721
16722 SDValue Vec1 = Op1.getOperand(0);
16723 SDValue Idx1 = Op1.getOperand(1);
16724 SDValue Vec2 = Op2.getOperand(0);
16725
16726 SDValue FMAOp1 = FMA.getOperand(0);
16727 SDValue FMAOp2 = FMA.getOperand(1);
16728 SDValue FMAAcc = FMA.getOperand(2);
16729
16730 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16731 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16732 return SDValue();
16733
16734 FMAOp1 = FMAOp1.getOperand(0);
16735 FMAOp2 = FMAOp2.getOperand(0);
16736 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16737 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16738 return SDValue();
16739
16740 SDValue Vec3 = FMAOp1.getOperand(0);
16741 SDValue Vec4 = FMAOp2.getOperand(0);
16742 SDValue Idx2 = FMAOp1.getOperand(1);
16743
16744 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16745 // Idx1 and Idx2 cannot be the same.
16746 Idx1 == Idx2)
16747 return SDValue();
16748
16749 if (Vec1 == Vec2 || Vec3 == Vec4)
16750 return SDValue();
16751
16752 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16753 return SDValue();
16754
16755 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16756 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16757 DAG.getTargetConstant(0, SL, MVT::i1));
16758 }
16759 }
16760 return SDValue();
16761}
16762
16763SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16764 DAGCombinerInfo &DCI) const {
16765 SelectionDAG &DAG = DCI.DAG;
16766 SDLoc SL(N);
16767
16768 SDValue LHS = N->getOperand(0);
16769 SDValue RHS = N->getOperand(1);
16770 EVT VT = LHS.getValueType();
16771 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16772
16773 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16774 if (!CRHS) {
16775 CRHS = dyn_cast<ConstantSDNode>(LHS);
16776 if (CRHS) {
16777 std::swap(LHS, RHS);
16778 CC = getSetCCSwappedOperands(CC);
16779 }
16780 }
16781
16782 if (CRHS) {
16783 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16784 isBoolSGPR(LHS.getOperand(0))) {
16785 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16786 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16787 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16788 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
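// To see why: sext of an i1 cc is either 0 (cc false) or -1 (cc true), so
// e.g. comparing it against -1 with ne is true exactly when cc is false,
// which is xor cc, -1.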
16789 if ((CRHS->isAllOnes() &&
16790 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16791 (CRHS->isZero() &&
16792 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16793 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16794 DAG.getAllOnesConstant(SL, MVT::i1));
16795 if ((CRHS->isAllOnes() &&
16796 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16797 (CRHS->isZero() &&
16798 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16799 return LHS.getOperand(0);
16800 }
16801
16802 const APInt &CRHSVal = CRHS->getAPIntValue();
16803 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16804 LHS.getOpcode() == ISD::SELECT &&
16805 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16806 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16807 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16808 isBoolSGPR(LHS.getOperand(0))) {
16809 // Given CT != FT:
16810 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16811 // setcc (select cc, CT, CF), CF, ne => cc
16812 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16813 // setcc (select cc, CT, CF), CT, eq => cc
16814 const APInt &CT = LHS.getConstantOperandAPInt(1);
16815 const APInt &CF = LHS.getConstantOperandAPInt(2);
16816
16817 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16818 (CT == CRHSVal && CC == ISD::SETNE))
16819 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16820 DAG.getAllOnesConstant(SL, MVT::i1));
16821 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16822 (CT == CRHSVal && CC == ISD::SETEQ))
16823 return LHS.getOperand(0);
16824 }
16825 }
16826
16827 // Eliminate setcc by using carryout from add/sub instruction
16828
16829 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16830 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16831 // similarly for subtraction
16832
16833 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16834 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
16835
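// For the eq-0 form: (y + 1) == 0 exactly when y is all ones, i.e. exactly
// when the 64-bit increment carries out of the high half, so the overflow
// bit of the high UADDO_CARRY is the desired setcc result.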
16836 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16837 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
16838 (CC == ISD::SETUGT &&
16839 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
16840 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16841 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16842 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16843
16844 SDValue Op0 = LHS.getOperand(0);
16845 SDValue Op1 = LHS.getOperand(1);
16846
16847 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16848 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16849
16850 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16851 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16852
16853 SDValue NodeLo =
16854 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16855 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16856
16857 SDValue CarryInHi = NodeLo.getValue(1);
16858 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16859 SL, DAG.getVTList(MVT::i32, MVT::i1),
16860 {Op0Hi, Op1Hi, CarryInHi});
16861
16862 SDValue ResultLo = NodeLo.getValue(0);
16863 SDValue ResultHi = NodeHi.getValue(0);
16864
16865 SDValue JoinedResult =
16866 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16867
16868 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16869 SDValue Overflow = NodeHi.getValue(1);
16870 DCI.CombineTo(LHS.getNode(), Result);
16871 return Overflow;
16872 }
16873
16874 if (VT != MVT::f32 && VT != MVT::f64 &&
16875 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16876 return SDValue();
16877
16878 // Match isinf/isfinite pattern
16879 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16880 // (fcmp one (fabs x), inf) -> (fp_class x,
16881 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16882 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16883 LHS.getOpcode() == ISD::FABS) {
16884 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16885 if (!CRHS)
16886 return SDValue();
16887
16888 const APFloat &APF = CRHS->getValueAPF();
16889 if (APF.isInfinity() && !APF.isNegative()) {
16890 const unsigned IsInfMask =
16891 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16892 const unsigned IsFiniteMask =
16893 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16894 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16895 SIInstrFlags::P_SUBNORMAL;
16896 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16897 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16898 DAG.getConstant(Mask, SL, MVT::i32));
16899 }
16900 }
16901
16902 return SDValue();
16903}
16904
16905SDValue
16906SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16907 DAGCombinerInfo &DCI) const {
16908 SelectionDAG &DAG = DCI.DAG;
16909 SDLoc SL(N);
16910 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16911
16912 SDValue Src = N->getOperand(0);
16913 SDValue Shift = N->getOperand(0);
16914
16915 // TODO: Extend type shouldn't matter (assuming legal types).
16916 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16917 Shift = Shift.getOperand(0);
16918
16919 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16920 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16921 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16922 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16923 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16924 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
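// For instance, for cvt_f32_ubyte0 (srl x, 8): Offset is 0 and the SRL adds
// 8 to it, so ShiftOffset is 8 and the node becomes cvt_f32_ubyte1 x.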
16925 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16926 SDValue Shifted = DAG.getZExtOrTrunc(
16927 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16928
16929 unsigned ShiftOffset = 8 * Offset;
16930 if (Shift.getOpcode() == ISD::SHL)
16931 ShiftOffset -= C->getZExtValue();
16932 else
16933 ShiftOffset += C->getZExtValue();
16934
16935 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16936 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16937 MVT::f32, Shifted);
16938 }
16939 }
16940 }
16941
16942 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16943 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16944 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16945 // We simplified Src. If this node is not dead, visit it again so it is
16946 // folded properly.
16947 if (N->getOpcode() != ISD::DELETED_NODE)
16948 DCI.AddToWorklist(N);
16949 return SDValue(N, 0);
16950 }
16951
16952 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16953 if (SDValue DemandedSrc =
16954 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16955 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16956
16957 return SDValue();
16958}
16959
16960SDValue SITargetLowering::performClampCombine(SDNode *N,
16961 DAGCombinerInfo &DCI) const {
16962 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16963 if (!CSrc)
16964 return SDValue();
16965
16966 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16967 const APFloat &F = CSrc->getValueAPF();
16968 APFloat Zero = APFloat::getZero(F.getSemantics());
16969 if (F < Zero ||
16970 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16971 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16972 }
16973
16974 APFloat One(F.getSemantics(), "1.0");
16975 if (F > One)
16976 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16977
16978 return SDValue(CSrc, 0);
16979}
16980
16981SDValue SITargetLowering::performSelectCombine(SDNode *N,
16982 DAGCombinerInfo &DCI) const {
16983
16984 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16985 // integer).
16986 // Detect when CMP and SELECT use the same constant and fold them to avoid
16987 // loading the constant twice. Specifically handles patterns like:
16988 // %cmp = icmp eq i32 %val, 4242
16989 // %sel = select i1 %cmp, i32 4242, i32 %other
16990 // It can be optimized to reuse %val instead of 4242 in select.
16991 SDValue Cond = N->getOperand(0);
16992 SDValue TrueVal = N->getOperand(1);
16993 SDValue FalseVal = N->getOperand(2);
16994
16995 // Check if condition is a comparison.
16996 if (Cond.getOpcode() != ISD::SETCC)
16997 return SDValue();
16998
16999 SDValue LHS = Cond.getOperand(0);
17000 SDValue RHS = Cond.getOperand(1);
17001 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17002
17003 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17004 bool isInteger = LHS.getValueType().isInteger();
17005
17006 // Handle simple floating-point and integer types only.
17007 if (!isFloatingPoint && !isInteger)
17008 return SDValue();
17009
17010 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17011 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17012 if (!isEquality && !isNonEquality)
17013 return SDValue();
17014
17015 SDValue ArgVal, ConstVal;
17016 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17017 (isInteger && isa<ConstantSDNode>(RHS))) {
17018 ConstVal = RHS;
17019 ArgVal = LHS;
17020 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17021 (isInteger && isa<ConstantSDNode>(LHS))) {
17022 ConstVal = LHS;
17023 ArgVal = RHS;
17024 } else {
17025 return SDValue();
17026 }
17027
17028 // Skip optimization for inlinable immediates.
17029 if (isFloatingPoint) {
17030 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17031 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17032 return SDValue();
17033 } else {
17034 if (AMDGPU::isInlinableIntLiteral(
17035 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17036 return SDValue();
17037 }
17038
17039 // For equality and non-equality comparisons, patterns:
17040 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17041 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17042 if (!(isEquality && TrueVal == ConstVal) &&
17043 !(isNonEquality && FalseVal == ConstVal))
17044 return SDValue();
17045
17046 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17047 SDValue SelectRHS =
17048 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17049 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17050 SelectLHS, SelectRHS);
17051}
17052
17053 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
17054 DAGCombinerInfo &DCI) const {
17055 switch (N->getOpcode()) {
17056 case ISD::ADD:
17057 case ISD::SUB:
17058 case ISD::SHL:
17059 case ISD::SRL:
17060 case ISD::SRA:
17061 case ISD::AND:
17062 case ISD::OR:
17063 case ISD::XOR:
17064 case ISD::MUL:
17065 case ISD::SETCC:
17066 case ISD::SELECT:
17067 case ISD::SMIN:
17068 case ISD::SMAX:
17069 case ISD::UMIN:
17070 case ISD::UMAX:
17071 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17072 return Res;
17073 break;
17074 default:
17075 break;
17076 }
17077
17078 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17079 return SDValue();
17080
17081 switch (N->getOpcode()) {
17082 case ISD::ADD:
17083 return performAddCombine(N, DCI);
17084 case ISD::PTRADD:
17085 return performPtrAddCombine(N, DCI);
17086 case ISD::SUB:
17087 return performSubCombine(N, DCI);
17088 case ISD::UADDO_CARRY:
17089 case ISD::USUBO_CARRY:
17090 return performAddCarrySubCarryCombine(N, DCI);
17091 case ISD::FADD:
17092 return performFAddCombine(N, DCI);
17093 case ISD::FSUB:
17094 return performFSubCombine(N, DCI);
17095 case ISD::FDIV:
17096 return performFDivCombine(N, DCI);
17097 case ISD::FMUL:
17098 return performFMulCombine(N, DCI);
17099 case ISD::SETCC:
17100 return performSetCCCombine(N, DCI);
17101 case ISD::SELECT:
17102 if (auto Res = performSelectCombine(N, DCI))
17103 return Res;
17104 break;
17105 case ISD::FMAXNUM:
17106 case ISD::FMINNUM:
17107 case ISD::FMAXNUM_IEEE:
17108 case ISD::FMINNUM_IEEE:
17109 case ISD::FMAXIMUM:
17110 case ISD::FMINIMUM:
17111 case ISD::FMAXIMUMNUM:
17112 case ISD::FMINIMUMNUM:
17113 case ISD::SMAX:
17114 case ISD::SMIN:
17115 case ISD::UMAX:
17116 case ISD::UMIN:
17117 case AMDGPUISD::FMIN_LEGACY:
17118 case AMDGPUISD::FMAX_LEGACY:
17119 return performMinMaxCombine(N, DCI);
17120 case ISD::FMA:
17121 return performFMACombine(N, DCI);
17122 case ISD::AND:
17123 return performAndCombine(N, DCI);
17124 case ISD::OR:
17125 return performOrCombine(N, DCI);
17126 case ISD::FSHR: {
17127 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17128 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17129 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17130 return matchPERM(N, DCI);
17131 }
17132 break;
17133 }
17134 case ISD::XOR:
17135 return performXorCombine(N, DCI);
17136 case ISD::ZERO_EXTEND:
17137 return performZeroExtendCombine(N, DCI);
17138 case ISD::SIGN_EXTEND_INREG:
17139 return performSignExtendInRegCombine(N, DCI);
17140 case AMDGPUISD::FP_CLASS:
17141 return performClassCombine(N, DCI);
17142 case ISD::FCANONICALIZE:
17143 return performFCanonicalizeCombine(N, DCI);
17144 case AMDGPUISD::RCP:
17145 return performRcpCombine(N, DCI);
17146 case ISD::FLDEXP:
17147 case AMDGPUISD::FRACT:
17148 case AMDGPUISD::RSQ:
17149 case AMDGPUISD::RCP_LEGACY:
17150 case AMDGPUISD::RCP_IFLAG:
17151 case AMDGPUISD::RSQ_CLAMP: {
17152 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17153 SDValue Src = N->getOperand(0);
17154 if (Src.isUndef())
17155 return Src;
17156 break;
17157 }
17158 case ISD::SINT_TO_FP:
17159 case ISD::UINT_TO_FP:
17160 return performUCharToFloatCombine(N, DCI);
17161 case ISD::FCOPYSIGN:
17162 return performFCopySignCombine(N, DCI);
17163 case AMDGPUISD::CVT_F32_UBYTE0:
17164 case AMDGPUISD::CVT_F32_UBYTE1:
17165 case AMDGPUISD::CVT_F32_UBYTE2:
17166 case AMDGPUISD::CVT_F32_UBYTE3:
17167 return performCvtF32UByteNCombine(N, DCI);
17168 case AMDGPUISD::FMED3:
17169 return performFMed3Combine(N, DCI);
17170 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17171 return performCvtPkRTZCombine(N, DCI);
17172 case AMDGPUISD::CLAMP:
17173 return performClampCombine(N, DCI);
17174 case ISD::SCALAR_TO_VECTOR: {
17175 SelectionDAG &DAG = DCI.DAG;
17176 EVT VT = N->getValueType(0);
17177
17178 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17179 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17180 SDLoc SL(N);
17181 SDValue Src = N->getOperand(0);
17182 EVT EltVT = Src.getValueType();
17183 if (EltVT != MVT::i16)
17184 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17185
17186 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17187 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17188 }
17189
17190 break;
17191 }
17192 case ISD::EXTRACT_VECTOR_ELT:
17193 return performExtractVectorEltCombine(N, DCI);
17194 case ISD::INSERT_VECTOR_ELT:
17195 return performInsertVectorEltCombine(N, DCI);
17196 case ISD::FP_ROUND:
17197 return performFPRoundCombine(N, DCI);
17198 case ISD::LOAD: {
17199 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17200 return Widened;
17201 [[fallthrough]];
17202 }
17203 default: {
17204 if (!DCI.isBeforeLegalize()) {
17205 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17206 return performMemSDNodeCombine(MemNode, DCI);
17207 }
17208
17209 break;
17210 }
17211 }
17212
17213 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17214}
17215
17216/// Helper function for adjustWritemask
17217static unsigned SubIdx2Lane(unsigned Idx) {
17218 switch (Idx) {
17219 default:
17220 return ~0u;
17221 case AMDGPU::sub0:
17222 return 0;
17223 case AMDGPU::sub1:
17224 return 1;
17225 case AMDGPU::sub2:
17226 return 2;
17227 case AMDGPU::sub3:
17228 return 3;
17229 case AMDGPU::sub4:
17230 return 4; // Possible with TFE/LWE
17231 }
17232}
17233
17234/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17235SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17236 SelectionDAG &DAG) const {
17237 unsigned Opcode = Node->getMachineOpcode();
17238
17239 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17240 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17241 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17242 return Node; // not implemented for D16
17243
17244 SDNode *Users[5] = {nullptr};
17245 unsigned Lane = 0;
17246 unsigned DmaskIdx =
17247 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17248 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17249 unsigned NewDmask = 0;
17250 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17251 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17252 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17253 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17254 unsigned TFCLane = 0;
17255 bool HasChain = Node->getNumValues() > 1;
17256
17257 if (OldDmask == 0) {
17258 // These are folded out, but on the chance it happens don't assert.
17259 return Node;
17260 }
17261
17262 unsigned OldBitsSet = llvm::popcount(OldDmask);
17263 // Work out which is the TFE/LWE lane if that is enabled.
17264 if (UsesTFC) {
17265 TFCLane = OldBitsSet;
17266 }
17267
17268 // Try to figure out the used register components
17269 for (SDUse &Use : Node->uses()) {
17270
17271 // Don't look at users of the chain.
17272 if (Use.getResNo() != 0)
17273 continue;
17274
17275 SDNode *User = Use.getUser();
17276
17277 // Abort if we can't understand the usage
17278 if (!User->isMachineOpcode() ||
17279 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17280 return Node;
17281
17282 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17283 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17284 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17285 // set, etc.
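// E.g. if OldDmask is 0b1010 (bits for the Y and W components), then
// Lane == 0 refers to Y and Lane == 1 refers to W.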
17286 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17287 if (Lane == ~0u)
17288 return Node;
17289
17290 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17291 if (UsesTFC && Lane == TFCLane) {
17292 Users[Lane] = User;
17293 } else {
17294 // Set which texture component corresponds to the lane.
17295 unsigned Comp;
17296 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17297 Comp = llvm::countr_zero(Dmask);
17298 Dmask &= ~(1 << Comp);
17299 }
17300
17301 // Abort if we have more than one user per component.
17302 if (Users[Lane])
17303 return Node;
17304
17305 Users[Lane] = User;
17306 NewDmask |= 1 << Comp;
17307 }
17308 }
17309
17310 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17311 bool NoChannels = !NewDmask;
17312 if (NoChannels) {
17313 if (!UsesTFC) {
17314 // No uses of the result and not using TFC. Then do nothing.
17315 return Node;
17316 }
17317 // If the original dmask has one channel - then nothing to do
17318 if (OldBitsSet == 1)
17319 return Node;
17320 // Use an arbitrary dmask - required for the instruction to work
17321 NewDmask = 1;
17322 }
17323 // Abort if there's no change
17324 if (NewDmask == OldDmask)
17325 return Node;
17326
17327 unsigned BitsSet = llvm::popcount(NewDmask);
17328
17329 // Check for TFE or LWE - increase the number of channels by one to account
17330 // for the extra return value
17331 // This will need adjustment for D16 if this is also included in
17332 // adjustWriteMask (this function) but at present D16 are excluded.
17333 unsigned NewChannels = BitsSet + UsesTFC;
17334
17335 int NewOpcode =
17336 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17337 assert(NewOpcode != -1 &&
17338 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17339 "failed to find equivalent MIMG op");
17340
17341 // Adjust the writemask in the node
17342 SmallVector<SDValue, 12> Ops;
17343 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17344 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17345 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17346
17347 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17348
17349 MVT ResultVT = NewChannels == 1
17350 ? SVT
17351 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17352 : NewChannels == 5 ? 8
17353 : NewChannels);
17354 SDVTList NewVTList =
17355 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17356
17357 MachineSDNode *NewNode =
17358 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17359
17360 if (HasChain) {
17361 // Update chain.
17362 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17363 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17364 }
17365
17366 if (NewChannels == 1) {
17367 assert(Node->hasNUsesOfValue(1, 0));
17368 SDNode *Copy =
17369 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17370 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17371 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17372 return nullptr;
17373 }
17374
17375 // Update the users of the node with the new indices
17376 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17377 SDNode *User = Users[i];
17378 if (!User) {
17379 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17380 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17381 if (i || !NoChannels)
17382 continue;
17383 } else {
17384 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17385 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17386 if (NewUser != User) {
17387 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17388 DAG.RemoveDeadNode(User);
17389 }
17390 }
17391
17392 switch (Idx) {
17393 default:
17394 break;
17395 case AMDGPU::sub0:
17396 Idx = AMDGPU::sub1;
17397 break;
17398 case AMDGPU::sub1:
17399 Idx = AMDGPU::sub2;
17400 break;
17401 case AMDGPU::sub2:
17402 Idx = AMDGPU::sub3;
17403 break;
17404 case AMDGPU::sub3:
17405 Idx = AMDGPU::sub4;
17406 break;
17407 }
17408 }
17409
17410 DAG.RemoveDeadNode(Node);
17411 return nullptr;
17412}
17413
17414 static bool isFrameIndexOp(SDValue Op) {
17415 if (Op.getOpcode() == ISD::AssertZext)
17416 Op = Op.getOperand(0);
17417
17418 return isa<FrameIndexSDNode>(Op);
17419}
17420
17421/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17422/// with frame index operands.
17423 /// LLVM assumes that inputs to these instructions are registers.
17424SDNode *
17425 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17426 SelectionDAG &DAG) const {
17427 if (Node->getOpcode() == ISD::CopyToReg) {
17428 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17429 SDValue SrcVal = Node->getOperand(2);
17430
17431 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17432 // to try understanding copies to physical registers.
17433 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17434 SDLoc SL(Node);
17435 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17436 SDValue VReg = DAG.getRegister(
17437 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17438
17439 SDNode *Glued = Node->getGluedNode();
17440 SDValue ToVReg = DAG.getCopyToReg(
17441 Node->getOperand(0), SL, VReg, SrcVal,
17442 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17443 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17444 VReg, ToVReg.getValue(1));
17445 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17446 DAG.RemoveDeadNode(Node);
17447 return ToResultReg.getNode();
17448 }
17449 }
17450
17451 SmallVector<SDValue, 8> Ops;
17452 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17453 if (!isFrameIndexOp(Node->getOperand(i))) {
17454 Ops.push_back(Node->getOperand(i));
17455 continue;
17456 }
17457
17458 SDLoc DL(Node);
17459 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17460 Node->getOperand(i).getValueType(),
17461 Node->getOperand(i)),
17462 0));
17463 }
17464
17465 return DAG.UpdateNodeOperands(Node, Ops);
17466}
17467
17468/// Fold the instructions after selecting them.
17469/// Returns null if users were already updated.
17470 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17471 SelectionDAG &DAG) const {
17472 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17473 unsigned Opcode = Node->getMachineOpcode();
17474
17475 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17476 !TII->isGather4(Opcode) &&
17477 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17478 return adjustWritemask(Node, DAG);
17479 }
17480
17481 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17482 legalizeTargetIndependentNode(Node, DAG);
17483 return Node;
17484 }
17485
17486 switch (Opcode) {
17487 case AMDGPU::V_DIV_SCALE_F32_e64:
17488 case AMDGPU::V_DIV_SCALE_F64_e64: {
17489 // Satisfy the operand register constraint when one of the inputs is
17490 // undefined. Ordinarily each undef value will have its own implicit_def of
17491 // a vreg, so force these to use a single register.
17492 SDValue Src0 = Node->getOperand(1);
17493 SDValue Src1 = Node->getOperand(3);
17494 SDValue Src2 = Node->getOperand(5);
17495
17496 if ((Src0.isMachineOpcode() &&
17497 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17498 (Src0 == Src1 || Src0 == Src2))
17499 break;
17500
17501 MVT VT = Src0.getValueType().getSimpleVT();
17502 const TargetRegisterClass *RC =
17503 getRegClassFor(VT, Src0.getNode()->isDivergent());
17504
17505 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17506 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17507
17508 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17509 Src0, SDValue());
17510
17511 // src0 must be the same register as src1 or src2, even if the value is
17512 // undefined, so make sure we don't violate this constraint.
17513 if (Src0.isMachineOpcode() &&
17514 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17515 if (Src1.isMachineOpcode() &&
17516 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17517 Src0 = Src1;
17518 else if (Src2.isMachineOpcode() &&
17519 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17520 Src0 = Src2;
17521 else {
17522 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17523 Src0 = UndefReg;
17524 Src1 = UndefReg;
17525 }
17526 } else
17527 break;
17528
17529 SmallVector<SDValue, 9> Ops(Node->ops());
17530 Ops[1] = Src0;
17531 Ops[3] = Src1;
17532 Ops[5] = Src2;
17533 Ops.push_back(ImpDef.getValue(1));
17534 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17535 }
17536 default:
17537 break;
17538 }
17539
17540 return Node;
17541}
17542
17543// Any MIMG instructions that use tfe or lwe require an initialization of the
17544// result register that will be written in the case of a memory access failure.
17545// The required code is also added to tie this init code to the result of the
17546// img instruction.
17547 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17548 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17549 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17550 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17551 MachineBasicBlock &MBB = *MI.getParent();
17552
17553 int DstIdx =
17554 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17555 unsigned InitIdx = 0;
17556
17557 if (TII->isImage(MI)) {
17558 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17559 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17560 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17561
17562 if (!TFE && !LWE) // intersect_ray
17563 return;
17564
17565 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17566 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17567 unsigned D16Val = D16 ? D16->getImm() : 0;
17568
17569 if (!TFEVal && !LWEVal)
17570 return;
17571
17572 // At least one of TFE or LWE is non-zero.
17573 // We have to insert a suitable initialization of the result value and
17574 // tie this to the dest of the image instruction.
17575
17576 // Calculate which dword we have to initialize to 0.
17577 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17578
17579 // check that dmask operand is found.
17580 assert(MO_Dmask && "Expected dmask operand in instruction");
17581
17582 unsigned dmask = MO_Dmask->getImm();
17583 // Determine the number of active lanes taking into account the
17584 // Gather4 special case
17585 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17586
17587 bool Packed = !Subtarget->hasUnpackedD16VMem();
17588
17589 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17590
17591 // Abandon attempt if the dst size isn't large enough
17592 // - this is in fact an error but this is picked up elsewhere and
17593 // reported correctly.
17594 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17595
17596 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17597 if (DstSize < InitIdx)
17598 return;
17599 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17600 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17601 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17602 } else {
17603 return;
17604 }
17605
17606 const DebugLoc &DL = MI.getDebugLoc();
17607
17608 // Create a register for the initialization value.
17609 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17610 unsigned NewDst = 0; // Final initialized value will be in here
17611
17612 // If PRTStrictNull feature is enabled (the default) then initialize
17613 // all the result registers to 0, otherwise just the error indication
17614 // register (VGPRn+1)
17615 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17616 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
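// E.g. with InitIdx == 4: under PRTStrictNull, SizeLeft == 4 and CurrIdx == 0,
// so dwords 0..3 are zero-initialized; otherwise SizeLeft == 1 and
// CurrIdx == 3, so only the final (error indication) dword is.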
17617
17618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17619 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17620 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17621 // Initialize dword
17622 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17623 // clang-format off
17624 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17625 .addImm(0);
17626 // clang-format on
17627 // Insert into the super-reg
17628 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17629 .addReg(PrevDst)
17630 .addReg(SubReg)
17631 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17632
17633 PrevDst = NewDst;
17634 }
17635
17636 // Add as an implicit operand
17637 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17638
17639 // Tie the just added implicit operand to the dst
17640 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17641}
17642
17643/// Assign the register class depending on the number of
17644/// bits set in the writemask
17645 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17646 SDNode *Node) const {
17647 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17648
17649 MachineFunction *MF = MI.getMF();
17650 MachineRegisterInfo &MRI = MF->getRegInfo();
17651
17652 if (TII->isVOP3(MI.getOpcode())) {
17653 // Make sure constant bus requirements are respected.
17654 TII->legalizeOperandsVOP3(MRI, MI);
17655
17656 if (TII->isMAI(MI)) {
17657 // The ordinary src0, src1, src2 were legalized above.
17658 //
17659 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17660 // as a separate instruction.
17661 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17662 AMDGPU::OpName::scale_src0);
17663 if (Src0Idx != -1) {
17664 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17665 AMDGPU::OpName::scale_src1);
17666 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17667 TII->usesConstantBus(MRI, MI, Src1Idx))
17668 TII->legalizeOpWithMove(MI, Src1Idx);
17669 }
17670 }
17671
17672 return;
17673 }
17674
17675 if (TII->isImage(MI))
17676 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17677}
17678
17679 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17680 uint64_t Val) {
17681 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17682 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17683}
17684
17685 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17686 const SDLoc &DL,
17687 SDValue Ptr) const {
17688 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17689
17690 // Build the half of the subregister with the constants before building the
17691 // full 128-bit register. If we are building multiple resource descriptors,
17692 // this will allow CSEing of the 2-component register.
17693 const SDValue Ops0[] = {
17694 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17695 buildSMovImm32(DAG, DL, 0),
17696 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17697 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17698 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17699
17700 SDValue SubRegHi = SDValue(
17701 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17702
17703 // Combine the constants and the pointer.
17704 const SDValue Ops1[] = {
17705 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17706 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17707 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17708
17709 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17710}
17711
17712/// Return a resource descriptor with the 'Add TID' bit enabled
17713/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17714/// of the resource descriptor) to create an offset, which is added to
17715/// the resource pointer.
17716 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17717 SDValue Ptr, uint32_t RsrcDword1,
17718 uint64_t RsrcDword2And3) const {
17719 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17720 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17721 if (RsrcDword1) {
17722 PtrHi =
17723 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17724 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17725 0);
17726 }
17727
17728 SDValue DataLo =
17729 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17730 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17731
17732 const SDValue Ops[] = {
17733 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17734 PtrLo,
17735 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17736 PtrHi,
17737 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17738 DataLo,
17739 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17740 DataHi,
17741 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17742
17743 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17744}
17745
17746//===----------------------------------------------------------------------===//
17747// SI Inline Assembly Support
17748//===----------------------------------------------------------------------===//
17749
17750std::pair<unsigned, const TargetRegisterClass *>
17751 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17752                                                StringRef Constraint,
17753 MVT VT) const {
17754 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17755
17756 const TargetRegisterClass *RC = nullptr;
17757 if (Constraint.size() == 1) {
17758     // Check if we cannot determine the bit size of the given value type. This
17759     // can happen, for example, when we have an empty struct (size 0):
17760     // `call void asm "", "v"({} poison)`.
17761 if (VT == MVT::Other)
17762 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17763 const unsigned BitWidth = VT.getSizeInBits();
17764 switch (Constraint[0]) {
17765 default:
17766 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17767 case 's':
17768 case 'r':
17769 switch (BitWidth) {
17770 case 16:
17771 RC = &AMDGPU::SReg_32RegClass;
17772 break;
17773 case 64:
17774 RC = &AMDGPU::SGPR_64RegClass;
17775 break;
17776      default:
17777        RC = TRI->getSGPRClassForBitWidth(BitWidth);
17778        if (!RC)
17779 return std::pair(0U, nullptr);
17780 break;
17781 }
17782 break;
17783 case 'v':
17784 switch (BitWidth) {
17785 case 1:
17786 return std::pair(0U, nullptr);
17787 case 16:
17788 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17789 : &AMDGPU::VGPR_32_Lo256RegClass;
17790 break;
17791 default:
17792 RC = Subtarget->has1024AddressableVGPRs()
17793 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17794 : TRI->getVGPRClassForBitWidth(BitWidth);
17795 if (!RC)
17796 return std::pair(0U, nullptr);
17797 break;
17798 }
17799 break;
17800 case 'a':
17801 if (!Subtarget->hasMAIInsts())
17802 break;
17803 switch (BitWidth) {
17804 case 1:
17805 return std::pair(0U, nullptr);
17806 case 16:
17807 RC = &AMDGPU::AGPR_32RegClass;
17808 break;
17809 default:
17810 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17811 if (!RC)
17812 return std::pair(0U, nullptr);
17813 break;
17814 }
17815 break;
17816 }
17817 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17818 const unsigned BitWidth = VT.getSizeInBits();
17819 switch (BitWidth) {
17820 case 16:
17821 RC = &AMDGPU::AV_32RegClass;
17822 break;
17823 default:
17824 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17825 if (!RC)
17826 return std::pair(0U, nullptr);
17827 break;
17828 }
17829 }
17830
17831 // We actually support i128, i16 and f16 as inline parameters
17832 // even if they are not reported as legal
17833 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17834 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17835 return std::pair(0U, RC);
17836
17837 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17838 if (Kind != '\0') {
17839 if (Kind == 'v') {
17840 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17841 } else if (Kind == 's') {
17842 RC = &AMDGPU::SGPR_32RegClass;
17843 } else if (Kind == 'a') {
17844 RC = &AMDGPU::AGPR_32RegClass;
17845 }
17846
17847 if (RC) {
17848 if (NumRegs > 1) {
17849 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17850 return std::pair(0U, nullptr);
17851
17852 uint32_t Width = NumRegs * 32;
17853 // Prohibit constraints for register ranges with a width that does not
17854 // match the required type.
17855 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17856 return std::pair(0U, nullptr);
17857
17858 MCRegister Reg = RC->getRegister(Idx);
17859        if (SIRegisterInfo::isVGPRClass(RC))
17860          RC = TRI->getVGPRClassForBitWidth(Width);
17861 else if (SIRegisterInfo::isSGPRClass(RC))
17862 RC = TRI->getSGPRClassForBitWidth(Width);
17863 else if (SIRegisterInfo::isAGPRClass(RC))
17864 RC = TRI->getAGPRClassForBitWidth(Width);
17865 if (RC) {
17866 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17867 if (!Reg) {
17868 // The register class does not contain the requested register,
17869 // e.g., because it is an SGPR pair that would violate alignment
17870 // requirements.
17871 return std::pair(0U, nullptr);
17872 }
17873 return std::pair(Reg, RC);
17874 }
17875 }
17876
17877 // Check for lossy scalar/vector conversions.
17878 if (VT.isVector() && VT.getSizeInBits() != 32)
17879 return std::pair(0U, nullptr);
17880 if (Idx < RC->getNumRegs())
17881 return std::pair(RC->getRegister(Idx), RC);
17882 return std::pair(0U, nullptr);
17883 }
17884 }
17885
17886 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17887 if (Ret.first)
17888 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17889
17890 return Ret;
17891}
17892
17893static bool isImmConstraint(StringRef Constraint) {
17894 if (Constraint.size() == 1) {
17895 switch (Constraint[0]) {
17896 default:
17897 break;
17898 case 'I':
17899 case 'J':
17900 case 'A':
17901 case 'B':
17902 case 'C':
17903 return true;
17904 }
17905 } else if (Constraint == "DA" || Constraint == "DB") {
17906 return true;
17907 }
17908 return false;
17909}
17910
17911 SITargetLowering::ConstraintType
17912 SITargetLowering::getConstraintType(StringRef Constraint) const {
17913   if (Constraint.size() == 1) {
17914 switch (Constraint[0]) {
17915 default:
17916 break;
17917 case 's':
17918 case 'v':
17919 case 'a':
17920 return C_RegisterClass;
17921 }
17922 } else if (Constraint.size() == 2) {
17923 if (Constraint == "VA")
17924 return C_RegisterClass;
17925 }
17926 if (isImmConstraint(Constraint)) {
17927 return C_Other;
17928 }
17929 return TargetLowering::getConstraintType(Constraint);
17930}
17931
17932static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17933   if (Size < 64) {
17934     Val = Val & maskTrailingOnes<uint64_t>(Size);
17935 }
17936 return Val;
17937}
17938
17939 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17940                                                     StringRef Constraint,
17941 std::vector<SDValue> &Ops,
17942 SelectionDAG &DAG) const {
17943 if (isImmConstraint(Constraint)) {
17944 uint64_t Val;
17945 if (getAsmOperandConstVal(Op, Val) &&
17946 checkAsmConstraintVal(Op, Constraint, Val)) {
17947 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17948 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17949 }
17950 } else {
17951     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17952   }
17953}
17954
17955 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17956   unsigned Size = Op.getScalarValueSizeInBits();
17957 if (Size > 64)
17958 return false;
17959
17960 if (Size == 16 && !Subtarget->has16BitInsts())
17961 return false;
17962
17963   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17964     Val = C->getSExtValue();
17965 return true;
17966 }
17967   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17968     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17969 return true;
17970 }
17971   if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17972     if (Size != 16 || Op.getNumOperands() != 2)
17973 return false;
17974 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17975 return false;
17976 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17977 Val = C->getSExtValue();
17978 return true;
17979 }
17980 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17981 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17982 return true;
17983 }
17984 }
17985
17986 return false;
17987}
17988
17989 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17990                                              uint64_t Val) const {
17991 if (Constraint.size() == 1) {
17992 switch (Constraint[0]) {
17993     case 'I':
17994       return AMDGPU::isInlinableIntLiteral(Val);
17995 case 'J':
17996 return isInt<16>(Val);
17997 case 'A':
17998 return checkAsmConstraintValA(Op, Val);
17999 case 'B':
18000 return isInt<32>(Val);
18001 case 'C':
18002       return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
18003              AMDGPU::isInlinableIntLiteral(Val);
18004 default:
18005 break;
18006 }
18007 } else if (Constraint.size() == 2) {
18008 if (Constraint == "DA") {
18009 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18010 int64_t LoBits = static_cast<int32_t>(Val);
18011 return checkAsmConstraintValA(Op, HiBits, 32) &&
18012 checkAsmConstraintValA(Op, LoBits, 32);
18013 }
18014 if (Constraint == "DB") {
18015 return true;
18016 }
18017 }
18018 llvm_unreachable("Invalid asm constraint");
18019}
18020
18021 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
18022                                               unsigned MaxSize) const {
18023 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18024 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18025 if (Size == 16) {
18026 MVT VT = Op.getSimpleValueType();
18027 switch (VT.SimpleTy) {
18028 default:
18029 return false;
18030 case MVT::i16:
18031 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
18032 case MVT::f16:
18033 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
18034 case MVT::bf16:
18035 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
18036 case MVT::v2i16:
18037 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
18038 case MVT::v2f16:
18039 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
18040 case MVT::v2bf16:
18041 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
18042 }
18043 }
18044 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
18045 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
18046 return true;
18047 return false;
18048}
18049
18050static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18051 switch (UnalignedClassID) {
18052 case AMDGPU::VReg_64RegClassID:
18053 return AMDGPU::VReg_64_Align2RegClassID;
18054 case AMDGPU::VReg_96RegClassID:
18055 return AMDGPU::VReg_96_Align2RegClassID;
18056 case AMDGPU::VReg_128RegClassID:
18057 return AMDGPU::VReg_128_Align2RegClassID;
18058 case AMDGPU::VReg_160RegClassID:
18059 return AMDGPU::VReg_160_Align2RegClassID;
18060 case AMDGPU::VReg_192RegClassID:
18061 return AMDGPU::VReg_192_Align2RegClassID;
18062 case AMDGPU::VReg_224RegClassID:
18063 return AMDGPU::VReg_224_Align2RegClassID;
18064 case AMDGPU::VReg_256RegClassID:
18065 return AMDGPU::VReg_256_Align2RegClassID;
18066 case AMDGPU::VReg_288RegClassID:
18067 return AMDGPU::VReg_288_Align2RegClassID;
18068 case AMDGPU::VReg_320RegClassID:
18069 return AMDGPU::VReg_320_Align2RegClassID;
18070 case AMDGPU::VReg_352RegClassID:
18071 return AMDGPU::VReg_352_Align2RegClassID;
18072 case AMDGPU::VReg_384RegClassID:
18073 return AMDGPU::VReg_384_Align2RegClassID;
18074 case AMDGPU::VReg_512RegClassID:
18075 return AMDGPU::VReg_512_Align2RegClassID;
18076 case AMDGPU::VReg_1024RegClassID:
18077 return AMDGPU::VReg_1024_Align2RegClassID;
18078 case AMDGPU::AReg_64RegClassID:
18079 return AMDGPU::AReg_64_Align2RegClassID;
18080 case AMDGPU::AReg_96RegClassID:
18081 return AMDGPU::AReg_96_Align2RegClassID;
18082 case AMDGPU::AReg_128RegClassID:
18083 return AMDGPU::AReg_128_Align2RegClassID;
18084 case AMDGPU::AReg_160RegClassID:
18085 return AMDGPU::AReg_160_Align2RegClassID;
18086 case AMDGPU::AReg_192RegClassID:
18087 return AMDGPU::AReg_192_Align2RegClassID;
18088 case AMDGPU::AReg_256RegClassID:
18089 return AMDGPU::AReg_256_Align2RegClassID;
18090 case AMDGPU::AReg_512RegClassID:
18091 return AMDGPU::AReg_512_Align2RegClassID;
18092 case AMDGPU::AReg_1024RegClassID:
18093 return AMDGPU::AReg_1024_Align2RegClassID;
18094 default:
18095 return -1;
18096 }
18097}
18098
18099// Figure out which registers should be reserved for stack access. Only after
18100// the function is legalized do we know all of the non-spill stack objects or if
18101// calls are present.
18102 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
18103   MachineRegisterInfo &MRI = MF.getRegInfo();
18104   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18105   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18106 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18107 const SIInstrInfo *TII = ST.getInstrInfo();
18108
18109 if (Info->isEntryFunction()) {
18110 // Callable functions have fixed registers used for stack access.
18111     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
18112   }
18113
18114 // TODO: Move this logic to getReservedRegs()
18115 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18116 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18117 Register SReg = ST.isWave32()
18118 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18119 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18120 &AMDGPU::SGPR_64RegClass);
18121 Info->setSGPRForEXECCopy(SReg);
18122
18123 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18124 Info->getStackPtrOffsetReg()));
18125 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18126 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18127
18128 // We need to worry about replacing the default register with itself in case
18129 // of MIR testcases missing the MFI.
18130 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18131 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18132
18133 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18134 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18135
18136 Info->limitOccupancy(MF);
18137
18138 if (ST.isWave32() && !MF.empty()) {
18139 for (auto &MBB : MF) {
18140 for (auto &MI : MBB) {
18141 TII->fixImplicitOperands(MI);
18142 }
18143 }
18144 }
18145
18146 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18147 // classes if required. Ideally the register class constraints would differ
18148 // per-subtarget, but there's no easy way to achieve that right now. This is
18149 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18150 // from using them as the register class for legal types.
18151 if (ST.needsAlignedVGPRs()) {
18152 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18153 const Register Reg = Register::index2VirtReg(I);
18154 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18155 if (!RC)
18156 continue;
18157 int NewClassID = getAlignedAGPRClassID(RC->getID());
18158 if (NewClassID != -1)
18159 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18160 }
18161 }
18162
18164}
18165
18166 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18167                                                      KnownBits &Known,
18168 const APInt &DemandedElts,
18169 const SelectionDAG &DAG,
18170 unsigned Depth) const {
18171 Known.resetAll();
18172 unsigned Opc = Op.getOpcode();
18173 switch (Opc) {
18174   case ISD::INTRINSIC_WO_CHAIN: {
18175     unsigned IID = Op.getConstantOperandVal(0);
18176 switch (IID) {
18177 case Intrinsic::amdgcn_mbcnt_lo:
18178 case Intrinsic::amdgcn_mbcnt_hi: {
18179 const GCNSubtarget &ST =
18180           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
18181       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18182 // most 31 + src1.
18183 Known.Zero.setBitsFrom(
18184 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
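      // e.g. on a wave32 target the lane count contributed by the intrinsic is
      // at most 31, so bits [31:5] are known zero before src1 is added; on
      // wave64, mbcnt_lo can return up to 32, so only bits [31:6] are known
      // zero.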
18185 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18186 Known = KnownBits::add(Known, Known2);
18187 return;
18188 }
18189 }
18190 break;
18191 }
18192 }
18193   AMDGPUTargetLowering::computeKnownBitsForTargetNode(
18194       Op, Known, DemandedElts, DAG, Depth);
18195}
18196
18197 void SITargetLowering::computeKnownBitsForFrameIndex(
18198     const int FI, KnownBits &Known, const MachineFunction &MF) const {
18199   TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
18200
18201 // Set the high bits to zero based on the maximum allowed scratch size per
18202 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18203 // calculation won't overflow, so assume the sign bit is never set.
18204 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18205}
18206
18207 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18208                                    GISelValueTracking &VT, KnownBits &Known,
18209 unsigned Dim) {
18210 unsigned MaxValue =
18211 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18212 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18213}
18214
18215 static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18216                              KnownBits &Known, const APInt &DemandedElts,
18217                              unsigned BFEWidth, bool SExt, unsigned Depth) {
18218   const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18219   const MachineOperand &Src1 = MI.getOperand(2);
18220
18221 unsigned Src1Cst = 0;
18222 if (Src1.isImm()) {
18223 Src1Cst = Src1.getImm();
18224 } else if (Src1.isReg()) {
18225 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18226 if (!Cst)
18227 return;
18228 Src1Cst = Cst->Value.getZExtValue();
18229 } else {
18230 return;
18231 }
18232
18233 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18234 // Width is always [22:16].
18235 const unsigned Offset =
18236 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18237 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
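  // Worked example (illustrative): a 32-bit BFE with Src1Cst = 0x0008000C has
  // Offset = 12 and Width = 8, so the known bits of bits [19:12] of the source
  // are extracted and then sign- or zero-extended back to 32 bits below.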
18238
18239 if (Width >= BFEWidth) // Ill-formed.
18240 return;
18241
18242 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18243 Depth + 1);
18244
18245 Known = Known.extractBits(Width, Offset);
18246
18247 if (SExt)
18248 Known = Known.sext(BFEWidth);
18249 else
18250 Known = Known.zext(BFEWidth);
18251}
18252
18253 void SITargetLowering::computeKnownBitsForTargetInstr(
18254     GISelValueTracking &VT, Register R, KnownBits &Known,
18255 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18256 unsigned Depth) const {
18257 Known.resetAll();
18258 const MachineInstr *MI = MRI.getVRegDef(R);
18259 switch (MI->getOpcode()) {
18260 case AMDGPU::S_BFE_I32:
18261 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18262 /*SExt=*/true, Depth);
18263 case AMDGPU::S_BFE_U32:
18264 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18265 /*SExt=*/false, Depth);
18266 case AMDGPU::S_BFE_I64:
18267 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18268 /*SExt=*/true, Depth);
18269 case AMDGPU::S_BFE_U64:
18270 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18271 /*SExt=*/false, Depth);
18272 case AMDGPU::G_INTRINSIC:
18273 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18274 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18275 switch (IID) {
18276 case Intrinsic::amdgcn_workitem_id_x:
18277 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18278 break;
18279 case Intrinsic::amdgcn_workitem_id_y:
18280 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18281 break;
18282 case Intrinsic::amdgcn_workitem_id_z:
18283 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18284 break;
18285 case Intrinsic::amdgcn_mbcnt_lo:
18286 case Intrinsic::amdgcn_mbcnt_hi: {
18287 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18288 // most 31 + src1.
18289 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18290 ? getSubtarget()->getWavefrontSizeLog2()
18291 : 5);
18292 KnownBits Known2;
18293 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18294 Depth + 1);
18295 Known = KnownBits::add(Known, Known2);
18296 break;
18297 }
18298 case Intrinsic::amdgcn_groupstaticsize: {
18299 // We can report everything over the maximum size as 0. We can't report
18300 // based on the actual size because we don't know if it's accurate or not
18301 // at any given point.
18302 Known.Zero.setHighBits(
18303 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18304 break;
18305 }
18306 }
18307 break;
18308 }
18309 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18310 Known.Zero.setHighBits(24);
18311 break;
18312 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18313 Known.Zero.setHighBits(16);
18314 break;
18315 case AMDGPU::G_AMDGPU_SMED3:
18316 case AMDGPU::G_AMDGPU_UMED3: {
18317 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18318
18319 KnownBits Known2;
18320 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18321 if (Known2.isUnknown())
18322 break;
18323
18324 KnownBits Known1;
18325 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18326 if (Known1.isUnknown())
18327 break;
18328
18329 KnownBits Known0;
18330 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18331 if (Known0.isUnknown())
18332 break;
18333
18334 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18335 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18336 Known.One = Known0.One & Known1.One & Known2.One;
18337 break;
18338 }
18339 }
18340}
18341
18342 Align SITargetLowering::computeKnownAlignForTargetInstr(
18343     GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18344     unsigned Depth) const {
18345 const MachineInstr *MI = MRI.getVRegDef(R);
18346 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18347 // FIXME: Can this move to generic code? What about the case where the call
18348 // site specifies a lower alignment?
18349 Intrinsic::ID IID = GI->getIntrinsicID();
18350     LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18351     AttributeList Attrs =
18352 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18353 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18354 return *RetAlign;
18355 }
18356 return Align(1);
18357}
18358
18359 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18360   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18361   const Align CacheLineAlign = Align(64);
18362
18363   // Pre-GFX10 targets did not benefit from loop alignment.
18364 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18365 getSubtarget()->hasInstFwdPrefetchBug())
18366 return PrefAlign;
18367
18368 // On GFX10 I$ is 4 x 64 bytes cache lines.
18369 // By default prefetcher keeps one cache line behind and reads two ahead.
18370 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18371 // behind and one ahead.
18372   // Therefore we can benefit from aligning loop headers if the loop fits in
18373   // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
18374   // cache lines and does not need alignment.
18375   // Otherwise, if the loop is at most 128 bytes we do not need to modify the
18376   // prefetch; if it is at most 192 bytes we need two lines behind.
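  // Worked example (illustrative): a 100-byte loop only receives the 64-byte
  // alignment below and leaves the prefetch mode alone, while a 180-byte loop
  // additionally gets the S_INST_PREFETCH adjustments emitted at the preheader
  // and exit blocks further down.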
18377
18378   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18379   const MachineBasicBlock *Header = ML->getHeader();
18380 if (Header->getAlignment() != PrefAlign)
18381 return Header->getAlignment(); // Already processed.
18382
18383 unsigned LoopSize = 0;
18384 for (const MachineBasicBlock *MBB : ML->blocks()) {
18385     // If an inner loop block is aligned, assume on average half of the
18386     // alignment size is added as nops.
18387 if (MBB != Header)
18388 LoopSize += MBB->getAlignment().value() / 2;
18389
18390 for (const MachineInstr &MI : *MBB) {
18391 LoopSize += TII->getInstSizeInBytes(MI);
18392 if (LoopSize > 192)
18393 return PrefAlign;
18394 }
18395 }
18396
18397 if (LoopSize <= 64)
18398 return PrefAlign;
18399
18400 if (LoopSize <= 128)
18401 return CacheLineAlign;
18402
18403   // If any of the parent loops is surrounded by prefetch instructions, do not
18404   // insert new ones for the inner loop; that would reset the parent's settings.
18405 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18406 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18407 auto I = Exit->getFirstNonDebugInstr();
18408 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18409 return CacheLineAlign;
18410 }
18411 }
18412
18413 MachineBasicBlock *Pre = ML->getLoopPreheader();
18414 MachineBasicBlock *Exit = ML->getExitBlock();
18415
18416 if (Pre && Exit) {
18417 auto PreTerm = Pre->getFirstTerminator();
18418 if (PreTerm == Pre->begin() ||
18419 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18420 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18421 .addImm(1); // prefetch 2 lines behind PC
18422
18423 auto ExitHead = Exit->getFirstNonDebugInstr();
18424 if (ExitHead == Exit->end() ||
18425 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18426 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18427 .addImm(2); // prefetch 1 line behind PC
18428 }
18429
18430 return CacheLineAlign;
18431}
18432
18433[[maybe_unused]]
18434static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18435 assert(N->getOpcode() == ISD::CopyFromReg);
18436 do {
18437 // Follow the chain until we find an INLINEASM node.
18438 N = N->getOperand(0).getNode();
18439 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18440 return true;
18441 } while (N->getOpcode() == ISD::CopyFromReg);
18442 return false;
18443}
18444
18445 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18446                                                   FunctionLoweringInfo *FLI,
18447                                                   UniformityInfo *UA) const {
18448 switch (N->getOpcode()) {
18449 case ISD::CopyFromReg: {
18450 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18451 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18452 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18453 Register Reg = R->getReg();
18454
18455 // FIXME: Why does this need to consider isLiveIn?
18456 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18457 return !TRI->isSGPRReg(MRI, Reg);
18458
18459 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18460 return UA->isDivergent(V);
18461
18462     assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18463     return !TRI->isSGPRReg(MRI, Reg);
18464 }
18465 case ISD::LOAD: {
18466 const LoadSDNode *L = cast<LoadSDNode>(N);
18467 unsigned AS = L->getAddressSpace();
18468 // A flat load may access private memory.
18469     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18470   }
18471 case ISD::CALLSEQ_END:
18472 return true;
18473   case ISD::INTRINSIC_WO_CHAIN:
18474     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18475   case ISD::INTRINSIC_W_CHAIN:
18476     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18477 case AMDGPUISD::ATOMIC_CMP_SWAP:
18478 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18479 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18480 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18481 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18482 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18483 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18484 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18485 case AMDGPUISD::BUFFER_ATOMIC_AND:
18486 case AMDGPUISD::BUFFER_ATOMIC_OR:
18487 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18488 case AMDGPUISD::BUFFER_ATOMIC_INC:
18489 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18490 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18491 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18492 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18493 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18494 // Target-specific read-modify-write atomics are sources of divergence.
18495 return true;
18496 default:
18497 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18498 // Generic read-modify-write atomics are sources of divergence.
18499 return A->readMem() && A->writeMem();
18500 }
18501 return false;
18502 }
18503}
18504
18505 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18506                                                EVT VT) const {
18507   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18508   case MVT::f32:
18509     return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18510   case MVT::f64:
18511   case MVT::f16:
18512     return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18513   default:
18514 return false;
18515 }
18516}
18517
18518 bool SITargetLowering::denormalsEnabledForType(
18519     LLT Ty, const MachineFunction &MF) const {
18520 switch (Ty.getScalarSizeInBits()) {
18521 case 32:
18522 return !denormalModeIsFlushAllF32(MF);
18523 case 64:
18524 case 16:
18525 return !denormalModeIsFlushAllF64F16(MF);
18526 default:
18527 return false;
18528 }
18529}
18530
18531 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18532                                                     const APInt &DemandedElts,
18533 const SelectionDAG &DAG,
18534 bool SNaN,
18535 unsigned Depth) const {
18536 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18537 const MachineFunction &MF = DAG.getMachineFunction();
18538     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18539
18540 if (Info->getMode().DX10Clamp)
18541 return true; // Clamped to 0.
18542 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18543 }
18544
18545   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18546                                                             DAG, SNaN, Depth);
18547}
18548
18549// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18550// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18551 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18552   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18553 return true;
18554
18555 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18556 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18557 if (DenormMode == DenormalMode::getPreserveSign())
18558 return true;
18559
18560 // TODO: Remove this.
18561 return RMW->getFunction()
18562 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18563 .getValueAsBool();
18564}
18565
18566 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18567   LLVMContext &Ctx = RMW->getContext();
18568 StringRef MemScope =
18569 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18570
18571 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18572 << "Hardware instruction generated for atomic "
18573 << RMW->getOperationName(RMW->getOperation())
18574 << " operation at memory scope " << MemScope;
18575}
18576
18577static bool isV2F16OrV2BF16(Type *Ty) {
18578 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18579 Type *EltTy = VT->getElementType();
18580 return VT->getNumElements() == 2 &&
18581 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18582 }
18583
18584 return false;
18585}
18586
18587static bool isV2F16(Type *Ty) {
18588   auto *VT = dyn_cast<FixedVectorType>(Ty);
18589   return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18590}
18591
18592static bool isV2BF16(Type *Ty) {
18593   auto *VT = dyn_cast<FixedVectorType>(Ty);
18594   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18595}
18596
18597/// \return true if atomicrmw integer ops work for the type.
18598static bool isAtomicRMWLegalIntTy(Type *Ty) {
18599 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18600 unsigned BW = IT->getBitWidth();
18601 return BW == 32 || BW == 64;
18602 }
18603
18604 return false;
18605}
18606
18607/// \return true if this atomicrmw xchg type can be selected.
18608static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18609 Type *Ty = RMW->getType();
18610 if (isAtomicRMWLegalIntTy(Ty))
18611 return true;
18612
18613 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18614 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18615 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18616 return BW == 32 || BW == 64;
18617 }
18618
18619 if (Ty->isFloatTy() || Ty->isDoubleTy())
18620 return true;
18621
18622   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18623     return VT->getNumElements() == 2 &&
18624 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18625 }
18626
18627 return false;
18628}
18629
18630/// \returns true if it's valid to emit a native instruction for \p RMW, based
18631/// on the properties of the target memory.
18632static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18633 const AtomicRMWInst *RMW,
18634 bool HasSystemScope) {
18635 // The remote/fine-grained access logic is different from the integer
18636 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18637 // fine-grained access does not work, even for a device local allocation.
18638 //
18639 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18640 // allocations work.
18641 if (HasSystemScope) {
18642     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18643         RMW->hasMetadata("amdgpu.no.remote.memory"))
18644 return true;
18645 if (Subtarget.hasEmulatedSystemScopeAtomics())
18646 return true;
18647   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18648     return true;
18649
18650 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18651}
18652
18653/// \return Action to perform on AtomicRMWInsts for integer operations.
18654 static TargetLowering::AtomicExpansionKind
18655 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18656   if (isAtomicRMWLegalIntTy(RMW->getType()))
18657     return TargetLowering::AtomicExpansionKind::None;
18658
18659   return TargetLowering::AtomicExpansionKind::CmpXChg;
18660 }
18661/// Return if a flat address space atomicrmw can access private memory.
18662 static bool flatInstrMayAccessPrivate(const Instruction *I) {
18663   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18664 return !MD ||
18666}
18667
18675
18676 TargetLowering::AtomicExpansionKind
18677 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18678 unsigned AS = RMW->getPointerAddressSpace();
18679   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18680     return AtomicExpansionKind::NotAtomic;
18681
18682 // 64-bit flat atomics that dynamically reside in private memory will silently
18683 // be dropped.
18684 //
18685 // Note that we will emit a new copy of the original atomic in the expansion,
18686 // which will be incrementally relegalized.
18687 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18688 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18689       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18690       flatInstrMayAccessPrivate(RMW))
18691     return AtomicExpansionKind::CustomExpand;
18692
18693 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18694     OptimizationRemarkEmitter ORE(RMW->getFunction());
18695     ORE.emit([=]() {
18696 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18697 });
18698 return Kind;
18699 };
18700
18701 auto SSID = RMW->getSyncScopeID();
18702 bool HasSystemScope =
18703 SSID == SyncScope::System ||
18704 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18705
18706 auto Op = RMW->getOperation();
18707 switch (Op) {
18708   case AtomicRMWInst::Xchg:
18709     // PCIe supports add and xchg for system atomics.
18710     return isAtomicRMWLegalXChgTy(RMW)
18711                ? TargetLowering::AtomicExpansionKind::None
18712                : TargetLowering::AtomicExpansionKind::CmpXChg;
18713   case AtomicRMWInst::Add:
18714     // PCIe supports add and xchg for system atomics.
18715     return atomicSupportedIfLegalIntType(RMW);
18716   case AtomicRMWInst::Sub:
18717 case AtomicRMWInst::And:
18718 case AtomicRMWInst::Or:
18719 case AtomicRMWInst::Xor:
18720 case AtomicRMWInst::Max:
18721   case AtomicRMWInst::Min:
18722   case AtomicRMWInst::UMax:
18723   case AtomicRMWInst::UMin:
18724   case AtomicRMWInst::UIncWrap:
18725   case AtomicRMWInst::UDecWrap:
18726   case AtomicRMWInst::USubCond:
18727   case AtomicRMWInst::USubSat: {
18728     if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18729       return AtomicExpansionKind::CmpXChg;
18730     if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18731       return AtomicExpansionKind::CmpXChg;
18732     if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
18733       auto *IT = dyn_cast<IntegerType>(RMW->getType());
18734       if (!IT || IT->getBitWidth() != 32)
18735         return AtomicExpansionKind::CmpXChg;
18736     }
18737
18740     if (Subtarget->hasEmulatedSystemScopeAtomics())
18741       return atomicSupportedIfLegalIntType(RMW);
18742
18743 // On most subtargets, for atomicrmw operations other than add/xchg,
18744 // whether or not the instructions will behave correctly depends on where
18745 // the address physically resides and what interconnect is used in the
18746     // system configuration. On some targets the instruction will nop,
18747 // and in others synchronization will only occur at degraded device scope.
18748 //
18749 // If the allocation is known local to the device, the instructions should
18750 // work correctly.
18751 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18752       return atomicSupportedIfLegalIntType(RMW);
18753
18754 // If fine-grained remote memory works at device scope, we don't need to
18755 // do anything.
18756 if (!HasSystemScope &&
18757 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18758       return atomicSupportedIfLegalIntType(RMW);
18759
18760 // If we are targeting a remote allocated address, it depends what kind of
18761 // allocation the address belongs to.
18762 //
18763 // If the allocation is fine-grained (in host memory, or in PCIe peer
18764 // device memory), the operation will fail depending on the target.
18765 //
18766 // Note fine-grained host memory access does work on APUs or if XGMI is
18767 // used, but we do not know if we are targeting an APU or the system
18768 // configuration from the ISA version/target-cpu.
18769 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18770       return atomicSupportedIfLegalIntType(RMW);
18771
18772     if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18773         Op == AtomicRMWInst::Xor) {
18774 // Atomic sub/or/xor do not work over PCI express, but atomic add
18775 // does. InstCombine transforms these with 0 to or, so undo that.
18776 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18777 ConstVal && ConstVal->isNullValue())
18779 }
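      // Illustrative IR: InstCombine canonicalizes `atomicrmw add ptr %p, i32 0`
      // into `atomicrmw or ptr %p, i32 0`; taking the expansion path here lets
      // emitExpandAtomicRMW (further down in this file) turn it back into the
      // add form, which PCIe does support.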
18780
18781 // If the allocation could be in remote, fine-grained memory, the rmw
18782 // instructions may fail. cmpxchg should work, so emit that. On some
18783 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18784 // even work, so you're out of luck anyway.
18785
18786 // In summary:
18787 //
18788 // Cases that may fail:
18789 // - fine-grained pinned host memory
18790 // - fine-grained migratable host memory
18791 // - fine-grained PCIe peer device
18792 //
18793 // Cases that should work, but may be treated overly conservatively.
18794 // - fine-grained host memory on an APU
18795 // - fine-grained XGMI peer device
18796       return AtomicExpansionKind::CmpXChg;
18797     }
18798
18799     return atomicSupportedIfLegalIntType(RMW);
18800   }
18801 case AtomicRMWInst::FAdd: {
18802 Type *Ty = RMW->getType();
18803
18804 // TODO: Handle REGION_ADDRESS
18805 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18806 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18807 // is fixed to round-to-nearest-even.
18808 //
18809 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18810 // round-to-nearest-even.
18811 //
18812 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18813 // suggests it is OK if the floating-point mode may not match the calling
18814 // thread.
18815 if (Ty->isFloatTy()) {
18816         return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18817                                                  : AtomicExpansionKind::CmpXChg;
18818       }
18819
18820 if (Ty->isDoubleTy()) {
18821 // Ignores denormal mode, but we don't consider flushing mandatory.
18822         return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18823                                                  : AtomicExpansionKind::CmpXChg;
18824       }
18825
18826 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18827         return AtomicExpansionKind::None;
18828
18829       return AtomicExpansionKind::CmpXChg;
18830     }
18831
18832 // LDS atomics respect the denormal mode from the mode register.
18833 //
18834 // Traditionally f32 global/buffer memory atomics would unconditionally
18835 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18836 // flush.
18837 //
18838 // On targets with flat atomic fadd, denormals would flush depending on
18839 // whether the target address resides in LDS or global memory. We consider
18840 // this flat-maybe-flush as will-flush.
18841 if (Ty->isFloatTy() &&
18842 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18843         !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18844       return AtomicExpansionKind::CmpXChg;
18845
18846 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18847 // safe. The message phrasing also should be better.
18848 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18849 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18850 // gfx942, gfx12
18851 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18852 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18853 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18854 // gfx90a, gfx942, gfx12
18855 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18856 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18857
18858 // gfx942, gfx12
18859 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18860 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18861 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18862 // gfx90a, gfx942, gfx12
18863 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18864 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18865
18866 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18867 // buffer. gfx12 does have the buffer version.
18868 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18869 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18870 }
18871
18872 // global and flat atomic fadd f64: gfx90a, gfx942.
18873 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18874 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18875
18876 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18877 if (Ty->isFloatTy()) {
18878 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18879 // gfx11+.
18880 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18881 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18882 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18883 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18884 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18885 } else {
18886 // gfx908
18887 if (RMW->use_empty() &&
18888 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18889 isV2F16(Ty))
18890 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18891 }
18892 }
18893
18894 // flat atomic fadd f32: gfx942, gfx11+.
18895 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18896 if (Subtarget->hasFlatAtomicFaddF32Inst())
18897 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18898
18899       // If it is in the flat address space and the type is float, we will try
18900       // to expand it if the target supports both global and LDS atomic fadd.
18901       // The reason we need that is that the expansion emits a check of the
18902       // address space: if it is in the global address space, we emit the
18903       // global atomic fadd; if it is in the shared address space, we emit the
18904       // LDS atomic fadd.
18905 if (Subtarget->hasLDSFPAtomicAddF32()) {
18906 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18908 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18910 }
18911 }
18912 }
18913
18915 }
18917 case AtomicRMWInst::FMax: {
18918 Type *Ty = RMW->getType();
18919
18920 // LDS float and double fmin/fmax were always supported.
18921 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18922       return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18923                                                  : AtomicExpansionKind::CmpXChg;
18924     }
18925
18926 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18927 // For flat and global cases:
18928 // float, double in gfx7. Manual claims denormal support.
18929 // Removed in gfx8.
18930 // float, double restored in gfx10.
18931 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18932 //
18933 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18934 // no f32.
18935 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18936 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18937 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18938 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18939 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18940 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18941                AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18942       if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18943 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18944 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18945 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18946 }
18947 }
18948
18950 }
18953   default:
18954     return AtomicExpansionKind::CmpXChg;
18955   }
18956
18957 llvm_unreachable("covered atomicrmw op switch");
18958}
18959
18966
18973
18974 TargetLowering::AtomicExpansionKind
18975 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18976 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18977 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18978     return AtomicExpansionKind::NotAtomic;
18979
18980 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18981     return AtomicExpansionKind::None;
18982
18983 const DataLayout &DL = CmpX->getDataLayout();
18984
18985 Type *ValTy = CmpX->getNewValOperand()->getType();
18986
18987 // If a 64-bit flat atomic may alias private, we need to avoid using the
18988 // atomic in the private case.
18989 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18991}
18992
18993const TargetRegisterClass *
18994SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18995   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18996   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18997 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18998 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18999 : &AMDGPU::SReg_32RegClass;
19000 if (!TRI->isSGPRClass(RC) && !isDivergent)
19001 return TRI->getEquivalentSGPRClass(RC);
19002 if (TRI->isSGPRClass(RC) && isDivergent) {
19003 if (Subtarget->hasGFX90AInsts())
19004 return TRI->getEquivalentAVClass(RC);
19005 return TRI->getEquivalentVGPRClass(RC);
19006 }
19007
19008 return RC;
19009}
19010
19011// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19012// uniform values (as produced by the mask results of control flow intrinsics)
19013// used outside of divergent blocks. The phi users need to also be treated as
19014// always uniform.
19015//
19016// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19017static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19018 unsigned WaveSize) {
19019 // FIXME: We assume we never cast the mask results of a control flow
19020 // intrinsic.
19021 // Early exit if the type won't be consistent as a compile time hack.
19022 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19023 if (!IT || IT->getBitWidth() != WaveSize)
19024 return false;
19025
19026 if (!isa<Instruction>(V))
19027 return false;
19028 if (!Visited.insert(V).second)
19029 return false;
19030 bool Result = false;
19031 for (const auto *U : V->users()) {
19032     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
19033       if (V == U->getOperand(1)) {
19034 switch (Intrinsic->getIntrinsicID()) {
19035 default:
19036 Result = false;
19037 break;
19038 case Intrinsic::amdgcn_if_break:
19039 case Intrinsic::amdgcn_if:
19040 case Intrinsic::amdgcn_else:
19041 Result = true;
19042 break;
19043 }
19044 }
19045 if (V == U->getOperand(0)) {
19046 switch (Intrinsic->getIntrinsicID()) {
19047 default:
19048 Result = false;
19049 break;
19050 case Intrinsic::amdgcn_end_cf:
19051 case Intrinsic::amdgcn_loop:
19052 Result = true;
19053 break;
19054 }
19055 }
19056 } else {
19057 Result = hasCFUser(U, Visited, WaveSize);
19058 }
19059 if (Result)
19060 break;
19061 }
19062 return Result;
19063}
19064
19065 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
19066                                                const Value *V) const {
19067 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
19068 if (CI->isInlineAsm()) {
19069 // FIXME: This cannot give a correct answer. This should only trigger in
19070 // the case where inline asm returns mixed SGPR and VGPR results, used
19071 // outside the defining block. We don't have a specific result to
19072 // consider, so this assumes if any value is SGPR, the overall register
19073 // also needs to be SGPR.
19074 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19075       TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19076           MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19077 for (auto &TC : TargetConstraints) {
19078 if (TC.Type == InlineAsm::isOutput) {
19079           ComputeConstraintToUse(TC, SDValue());
19080           const TargetRegisterClass *RC =
19081 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19082 TC.ConstraintVT)
19083 .second;
19084 if (RC && SIRI->isSGPRClass(RC))
19085 return true;
19086 }
19087 }
19088 }
19089 }
19090   SmallPtrSet<const Value *, 16> Visited;
19091   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19092}
19093
19094 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19095   for (SDUse &Use : N->uses()) {
19096     if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
19097       if (getBasePtrIndex(M) == Use.getOperandNo())
19098 return true;
19099 }
19100 }
19101 return false;
19102}
19103
19104 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19105                                            SDValue N1) const {
19106 if (!N0.hasOneUse())
19107 return false;
19108   // Preserve the opportunity to keep N0 uniform.
19109 if (N0->isDivergent() || !N1->isDivergent())
19110 return true;
19111 // Check if we have a good chance to form the memory access pattern with the
19112 // base and offset
19113 return (DAG.isBaseWithConstantOffset(N0) &&
19115}
19116
19117 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19118                                            Register N0, Register N1) const {
19119 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19120}
19121
19124 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19126 if (I.getMetadata("amdgpu.noclobber"))
19127 Flags |= MONoClobber;
19128 if (I.getMetadata("amdgpu.last.use"))
19129 Flags |= MOLastUse;
19130 return Flags;
19131}
19132
19133 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
19134     Instruction *AI) const {
19135 // Given: atomicrmw fadd ptr %addr, float %val ordering
19136 //
19137 // With this expansion we produce the following code:
19138 // [...]
19139 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19140 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19141 //
19142 // atomicrmw.shared:
19143 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19144 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19145 // float %val ordering
19146 // br label %atomicrmw.phi
19147 //
19148 // atomicrmw.check.private:
19149 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19150 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19151 //
19152 // atomicrmw.private:
19153 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19154 // %loaded.private = load float, ptr addrspace(5) %cast.private
19155 // %val.new = fadd float %loaded.private, %val
19156 // store float %val.new, ptr addrspace(5) %cast.private
19157 // br label %atomicrmw.phi
19158 //
19159 // atomicrmw.global:
19160 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19161 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19162 // float %val ordering
19163 // br label %atomicrmw.phi
19164 //
19165 // atomicrmw.phi:
19166 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19167 // [ %loaded.private, %atomicrmw.private ],
19168 // [ %loaded.global, %atomicrmw.global ]
19169 // br label %atomicrmw.end
19170 //
19171 // atomicrmw.end:
19172 // [...]
19173 //
19174 //
19175 // For 64-bit atomics which may reside in private memory, we perform a simpler
19176 // version that only inserts the private check, and uses the flat operation.
19177
19178 IRBuilder<> Builder(AI);
19179 LLVMContext &Ctx = Builder.getContext();
19180
19181 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19182 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19183                                 : AtomicCmpXchgInst::getPointerOperandIndex();
19184   Value *Addr = AI->getOperand(PtrOpIdx);
19185
19186 /// TODO: Only need to check private, then emit flat-known-not private (no
19187 /// need for shared block, or cast to global).
19188   auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
19189
19190 Align Alignment;
19191 if (RMW)
19192 Alignment = RMW->getAlign();
19193 else if (CX)
19194 Alignment = CX->getAlign();
19195 else
19196 llvm_unreachable("unhandled atomic operation");
19197
19198 // FullFlatEmulation is true if we need to issue the private, shared, and
19199 // global cases.
19200 //
19201 // If this is false, we are only dealing with the flat-targeting-private case,
19202 // where we only insert a check for private and still use the flat instruction
19203 // for global and shared.
19204
19205 bool FullFlatEmulation =
19206 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19207 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19208 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19209 RMW->getType()->isDoubleTy()));
19210
19211 // If the return value isn't used, do not introduce a false use in the phi.
19212 bool ReturnValueIsUsed = !AI->use_empty();
19213
19214 BasicBlock *BB = Builder.GetInsertBlock();
19215 Function *F = BB->getParent();
19216 BasicBlock *ExitBB =
19217 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19218 BasicBlock *SharedBB = nullptr;
19219
19220 BasicBlock *CheckPrivateBB = BB;
19221 if (FullFlatEmulation) {
19222 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19223 CheckPrivateBB =
19224 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19225 }
19226
19227 BasicBlock *PrivateBB =
19228 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19229 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19230 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19231
19232 std::prev(BB->end())->eraseFromParent();
19233 Builder.SetInsertPoint(BB);
19234
19235 Value *LoadedShared = nullptr;
19236 if (FullFlatEmulation) {
19237 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19238 {Addr}, nullptr, "is.shared");
19239 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19240 Builder.SetInsertPoint(SharedBB);
19241 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19242         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19243
19244 Instruction *Clone = AI->clone();
19245 Clone->insertInto(SharedBB, SharedBB->end());
19246 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19247 LoadedShared = Clone;
19248
19249 Builder.CreateBr(PhiBB);
19250 Builder.SetInsertPoint(CheckPrivateBB);
19251 }
19252
19253 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19254 {Addr}, nullptr, "is.private");
19255 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19256
19257 Builder.SetInsertPoint(PrivateBB);
19258
19259 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19260       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19261
19262 Value *LoadedPrivate;
19263 if (RMW) {
19264 LoadedPrivate = Builder.CreateAlignedLoad(
19265 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19266
19267 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19268 LoadedPrivate, RMW->getValOperand());
19269
19270 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19271 } else {
19272 auto [ResultLoad, Equal] =
19273 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19274 CX->getNewValOperand(), CX->getAlign());
19275
19276 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19277 ResultLoad, 0);
19278 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19279 }
19280
19281 Builder.CreateBr(PhiBB);
19282
19283 Builder.SetInsertPoint(GlobalBB);
19284
19285 // Continue using a flat instruction if we only emitted the check for private.
19286 Instruction *LoadedGlobal = AI;
19287 if (FullFlatEmulation) {
19288 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19289         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19290     AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19291 }
19292
19293 AI->removeFromParent();
19294 AI->insertInto(GlobalBB, GlobalBB->end());
19295
19296 // The new atomicrmw may go through another round of legalization later.
19297 if (!FullFlatEmulation) {
19298 // We inserted the runtime check already, make sure we do not try to
19299 // re-expand this.
19300 // TODO: Should union with any existing metadata.
19301 MDBuilder MDB(F->getContext());
19302 MDNode *RangeNotPrivate =
19305 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19306 RangeNotPrivate);
19307 }
19308
19309 Builder.CreateBr(PhiBB);
19310
19311 Builder.SetInsertPoint(PhiBB);
19312
19313 if (ReturnValueIsUsed) {
19314 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19315 AI->replaceAllUsesWith(Loaded);
19316 if (FullFlatEmulation)
19317 Loaded->addIncoming(LoadedShared, SharedBB);
19318 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19319 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19320 Loaded->takeName(AI);
19321 }
19322
19323 Builder.CreateBr(ExitBB);
19324}
19325
19327 unsigned PtrOpIdx) {
19328 Value *PtrOp = I->getOperand(PtrOpIdx);
19331
19332 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19333 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19334 I->getIterator());
19335 I->setOperand(PtrOpIdx, ASCast);
19336}
19337
19340
19343
19346 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19347 ConstVal && ConstVal->isNullValue()) {
19348 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19349     AI->setOperation(AtomicRMWInst::Add);
19350
19351 // We may still need the private-alias-flat handling below.
19352
19353 // TODO: Skip this for cases where we cannot access remote memory.
19354 }
19355 }
19356
19357 // The non-flat expansions should only perform the de-canonicalization of
19358   // identity values.
19359   if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19360     return;
19361
19363}
19364
19371
19375
19377 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19378}
19379
19380 void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19381   if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19382 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19383
19385 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19386}
19387
19388LoadInst *
19389 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19390   IRBuilder<> Builder(AI);
19391 auto Order = AI->getOrdering();
19392
19393   // The optimization removes the store aspect of the atomicrmw. Therefore, the
19394   // cache must be flushed if the atomic ordering had release semantics. That
19395   // does not necessarily require a fence; a release fence just happens to
19396   // perform that flush. Avoid replacing an atomicrmw that has release semantics.
19397 if (isReleaseOrStronger(Order))
19398 return nullptr;
19399
19400 LoadInst *LI = Builder.CreateAlignedLoad(
19401 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19402 LI->setAtomic(Order, AI->getSyncScopeID());
19403 LI->copyMetadata(*AI);
19404 LI->takeName(AI);
19405 AI->replaceAllUsesWith(LI);
19406 AI->eraseFromParent();
19407 return LI;
19408}
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1102
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6052
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1479
bool isNegative() const
Definition APFloat.h:1431
bool isNormal() const
Definition APFloat.h:1435
APInt bitcastToAPInt() const
Definition APFloat.h:1335
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1061
bool isInfinity() const
Definition APFloat.h:1428
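The APFloat factories and queries above are what the floating-point constant folds in this file build on. A minimal, self-contained sketch (not code from SIISelLowering.cpp; the function name is illustrative) showing a typical f32-to-f16 narrowing check:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Returns true if the f32 value can be represented exactly as an f16.
static bool fitsInF16(float F) {
  APFloat Val(F);
  bool LosesInfo = false;
  Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return !LosesInfo && !Val.isInfinity();
}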
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1400
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1394
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1648
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
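Constructors and queries like these back the known-bits reasoning and the byte-permute combines in this file. A small standalone illustration, with values chosen only for demonstration:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintMaskDemo() {
  APInt High = APInt::getHighBitsSet(32, 16); // 0xFFFF0000
  unsigned TZ = High.countr_zero();           // 16

  APInt Block = APInt::getBitsSet(32, 8, 16); // bits [8,16) -> 0x0000FF00
  bool OneBit = Block.isOneBitSet(8);         // false: eight bits are set

  APInt V(32, 0);
  V.setBitsFrom(31);                          // only the sign bit
  bool Sign = V.isSignMask();                 // true
  (void)TZ; (void)OneBit; (void)Sign;
}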
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
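Accessors such as getOperation and getPointerAddressSpace are what the atomic-expansion predicates declared earlier (flatInstrMayAccessPrivate, atomicSupportedIfLegalIntType, and friends) inspect. A hypothetical helper in the same style; it is not the in-tree logic, and the address-space parameter is a placeholder:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical: is this a floating-point atomicrmw on a pointer in AS?
static bool isFPAtomicInAS(const AtomicRMWInst *RMW, unsigned AS) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    return RMW->getPointerAddressSpace() == AS;
  default:
    return false;
  }
}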
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
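These CCState methods drive the SGPR/VGPR argument allocation in this file. A sketch of the usual pattern, assuming a CCState has already been set up by the caller; the candidate-register handling and the 4-byte fallback are illustrative, not the actual ABI rules:

#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Hypothetical: take the next free register from Candidates, or fall back
// to a 4-byte, 4-aligned stack slot. Returns the stack offset or -1.
static int64_t allocateRegOrStack(CCState &CCInfo,
                                  ArrayRef<MCPhysReg> Candidates) {
  unsigned Idx = CCInfo.getFirstUnallocated(Candidates);
  if (Idx != Candidates.size()) {
    CCInfo.AllocateReg(Candidates[Idx]);
    return -1;
  }
  return CCInfo.AllocateStack(4, Align(4));
}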
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
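Subtarget predicates like these gate the combines declared above (compare supportsMin3Max3). A hypothetical gating helper, shown only to illustrate the shape of such checks; the policy is not the in-tree one:

#include "GCNSubtarget.h"
using namespace llvm;

// Hypothetical: is some form of 16-bit min3/max3 available on this subtarget?
static bool has16BitMin3Max3(const GCNSubtarget &ST) {
  return ST.hasMin3Max3_16() || ST.hasMin3Max3PKF16();
}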
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2794
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
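The LLT helpers above describe register types on the GlobalISel paths (for example in getPreloadedValue and the known-bits hooks). A brief standalone illustration; the header path assumes a recent LLVM layout:

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static void lltDemo() {
  LLT S32 = LLT::scalar(32);
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  unsigned Bits = S32.getScalarSizeInBits(); // 32
  LLT S16 = S32.changeElementSize(16);       // scalar of 16 bits
  (void)Bits; (void)P1.getSizeInBits(); (void)S16;
}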
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
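createRange is the usual way range metadata ends up on loads of bounded values (for example workitem IDs). A minimal sketch of attaching a [0, 1024) range to an existing integer load; the bound is a placeholder, not the real group-size limit used by this backend:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

// Attach !range [0, 1024) to an integer load. Assumes LI has integer type.
static void attachRange(LoadInst *LI) {
  unsigned BitWidth = LI->getType()->getIntegerBitWidth();
  MDBuilder MDB(LI->getContext());
  MDNode *Range = MDB.createRange(APInt(BitWidth, 0), APInt(BitWidth, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);
}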
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
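A short illustration of these MVT queries (values only):

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtDemo() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  MVT Scalar = V4I32.getScalarType();              // MVT::i32
  bool Pow2 = V4I32.isPow2VectorType();            // true
  (void)NumElts; (void)Scalar; (void)Pow2;
  (void)V4I32.getSizeInBits();                     // 128 bits
  (void)V4I32.getStoreSize();                      // 16 bytes
}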
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
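BuildMI plus these chained add* helpers is how the custom inserters later in this file emit machine code. A hypothetical snippet, assuming the caller supplies the block, insertion point, and SIInstrInfo; the opcodes and register choices are illustrative only:

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Hypothetical: move an immediate into a scratch SGPR, then copy it to M0.
static void emitSetM0(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                      const DebugLoc &DL, const SIInstrInfo *TII,
                      Register ScratchSGPR, int64_t Imm) {
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchSGPR).addImm(Imm);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0).addReg(ScratchSGPR);
}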
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
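These flags are or'd together when target code materializes memory operands, e.g. in getTgtMemIntrinsic and the custom load/store lowering. A small sketch with placeholder size and alignment:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Hypothetical: describe a 4-byte invariant, dereferenceable load.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo) {
  MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                   MachineMemOperand::MODereferenceable |
                                   MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Align(4));
}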
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
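These SDValue/SDNode accessors are the vocabulary of the DAG combines in this file. A hypothetical predicate in the style of small helpers such as vectorEltWillFoldAway, shown only to illustrate typical traversal:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical: does V extract lane 0 of a single-use vector?
static bool isCheapExtractOfLane0(SDValue V) {
  if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return false;
  SDValue Vec = V.getOperand(0);
  auto *Idx = dyn_cast<ConstantSDNode>(V.getOperand(1));
  return Idx && Idx->isZero() && Vec.hasOneUse();
}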
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
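The static class queries above are used when picking register classes for values (see getRegClassFor and AdjustInstrPostInstrSelection below). A tiny usage sketch:

#include "SIRegisterInfo.h"
using namespace llvm;

// Query a 64-bit wide scalar register class and classify it.
static bool sgprClassQueryDemo() {
  const TargetRegisterClass *RC = SIRegisterInfo::getSGPRClassForBitWidth(64);
  return RC && SIRegisterInfo::isSGPRClass(RC); // true
}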
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
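A minimal sketch of how a lowering routine might use this helper; the wrapper name firstElt is an illustrative assumption, not part of the source (assumes llvm/CodeGen/SelectionDAG.h and the llvm namespace):
  // Extracts element 0 of Vec as a scalar of the vector's element type.
  SDValue firstElt(SDValue Vec, const SDLoc &DL, SelectionDAG &DAG) {
    EVT EltVT = Vec.getValueType().getVectorElementType();
    return DAG.getExtractVectorElt(DL, EltVT, Vec, /*Idx=*/0);
  }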
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
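A minimal sketch of the getSetCC helper; the wrapper name isNonZero and its operands are illustrative assumptions (assumes llvm/CodeGen/SelectionDAG.h and the llvm namespace):
  // Builds (setne X, 0) in the target's preferred setcc result type.
  SDValue isNonZero(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
                    const TargetLowering &TLI) {
    EVT VT = X.getValueType();
    EVT CCVT =
        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    return DAG.getSetCC(DL, CCVT, X, DAG.getConstant(0, DL, VT), ISD::SETNE);
  }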
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
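A minimal sketch spelling out the equivalence in the brief; the wrapper name invertBits is an illustrative assumption (assumes llvm/CodeGen/SelectionDAG.h and the llvm namespace):
  // getNOT(DL, Val, VT) builds the same node as
  // DAG.getNode(ISD::XOR, DL, VT, Val, DAG.getAllOnesConstant(DL, VT)).
  SDValue invertBits(SDValue Val, const SDLoc &DL, SelectionDAG &DAG) {
    return DAG.getNOT(DL, Val, Val.getValueType());
  }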
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
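A minimal sketch of forming an offset pointer, as is typical when splitting a memory access; the wrapper name and the 4-byte offset are illustrative assumptions (assumes llvm/CodeGen/SelectionDAG.h and the llvm namespace):
  // Returns Ptr advanced by 4 bytes.
  SDValue nextDword(SDValue Ptr, const SDLoc &DL, SelectionDAG &DAG) {
    return DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), DL);
  }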
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
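A minimal sketch of the getSelectCC helper; the wrapper name smaxExample is an illustrative assumption (assumes llvm/CodeGen/SelectionDAG.h and the llvm namespace):
  // Selects the larger of A and B under a signed compare: (A > B) ? A : B.
  SDValue smaxExample(SDValue A, SDValue B, const SDLoc &DL, SelectionDAG &DAG) {
    return DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);
  }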
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
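A minimal sketch of splatting a scalar into a fixed-width vector; the wrapper name and the v4i32 choice are illustrative assumptions (assumes llvm/CodeGen/SelectionDAG.h and the llvm namespace):
  // Builds a BUILD_VECTOR with Scalar repeated in all four lanes.
  // Assumes Scalar already has type i32.
  SDValue splat4xi32(SDValue Scalar, const SDLoc &DL, SelectionDAG &DAG) {
    EVT VT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4);
    return DAG.getSplatBuildVector(VT, DL, Scalar);
  }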
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
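A minimal sketch of StringSwitch in the style used when classifying inline-asm constraint strings; the constraint letters and return codes are illustrative assumptions, not the backend's actual mapping:
  #include "llvm/ADT/StringSwitch.h"
  static unsigned classifyConstraintKind(llvm::StringRef C) {
    return llvm::StringSwitch<unsigned>(C)
        .Case("v", 0)   // e.g. a VGPR-like constraint (assumed)
        .Case("s", 1)   // e.g. an SGPR-like constraint (assumed)
        .Default(2);    // anything else
  }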
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:426
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:246
Value * getOperand(unsigned i) const
Definition User.h:233
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point literals.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:818
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:787
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:778
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:852
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:746
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:992
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:974
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:664
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:786
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:795
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:969
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:703
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:810
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:887
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:804
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:996
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:832
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum, which behave the same as FMINNUM_IEEE and FMAXNUM_IEEE except...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
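A couple of compile-time checks sketching the contract; the specific constants are illustrative:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isShiftedMask_64(0x0FF0ull), "contiguous run of ones");
  static_assert(!llvm::isShiftedMask_64(0x5ull), "0b101 is not contiguous");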
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
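A minimal sketch of splitting a 64-bit immediate into its 32-bit halves with Lo_32/Hi_32; the helper name splitImm is an illustrative assumption:
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  #include <utility>
  std::pair<uint32_t, uint32_t> splitImm(uint64_t Imm) {
    return {llvm::Lo_32(Imm), llvm::Hi_32(Imm)}; // {low half, high half}
  }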
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
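A minimal sketch of the kind of arithmetic this is used for; the helper name numDwords is an illustrative assumption:
  #include "llvm/Support/MathExtras.h"
  // Number of 32-bit registers needed to hold Bits bits,
  // e.g. divideCeil(96, 32) == 3.
  unsigned numDwords(unsigned Bits) { return llvm::divideCeil(Bits, 32u); }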
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
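A minimal sketch of rounding a byte size up to an alignment boundary; the helper name and the 16-byte alignment are illustrative assumptions:
  #include "llvm/Support/Alignment.h"
  #include <cstdint>
  uint64_t padTo16(uint64_t Size) {
    return llvm::alignTo(Size, llvm::Align(16)); // alignTo(20, Align(16)) == 32
  }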
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:434
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
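A compile-time check sketching the contract; the constant is illustrative:
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  static_assert(llvm::maskTrailingOnes<uint32_t>(5) == 0x1Fu,
                "low five bits set");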
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
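A hedged sketch of querying a DenormalMode; the predicate name is an assumption, and only the Output component is inspected here.
#include "llvm/ADT/FloatingPointMode.h"
using namespace llvm;
// Sketch: does this mode flush denormal results to a signed zero?
bool flushesOutputDenormals(DenormalMode Mode) {
  // DenormalMode::getPreserveSign() -> true, DenormalMode::getIEEE() -> false.
  return Mode.Output == DenormalMode::PreserveSign;
}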
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
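A hedged sketch exercising a few of the EVT queries listed above; the function name and the concrete types are illustrative assumptions.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
// Sketch: build simple and extended EVTs and query them.
void evtExamples(LLVMContext &Ctx) {
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4); // simple type <4 x f32>
  (void)V4F32.getVectorNumElements();             // 4
  (void)V4F32.getScalarType();                    // f32
  (void)V4F32.getSizeInBits();                    // 128 bits
  (void)V4F32.changeTypeToInteger();              // <4 x i32>
  EVT I48 = EVT::getIntegerVT(Ctx, 48);           // extended: there is no MVT::i48
  (void)I48.isSimple();                           // false
  (void)I48.getStoreSize();                       // 6 bytes
}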
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing (from the perspective of the caller) return value virtual register.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:175
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:228
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:183
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:350
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:251
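A short sketch of the KnownBits queries listed above; the modeled constant 0x000000F0 and the function name are illustrative assumptions.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// Sketch: model a 32-bit value known to be exactly 0x000000F0 and query it.
void knownBitsExample() {
  KnownBits Known(32);            // starts fully unknown: isUnknown() == true
  Known.One = APInt(32, 0xF0);    // these bits are known one
  Known.Zero = ~Known.One;        // every other bit is known zero
  (void)Known.countMinLeadingZeros();   // 24
  (void)Known.extractBits(4, 4);        // the known-one nibble at bits [4,8)
  KnownBits Wide = Known.zext(64);      // zero-extend the tracked value
  (void)KnownBits::add(Known, Known);   // known bits of (x + x)
}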
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
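A hedged sketch of building a MachinePointerInfo for a stack slot; MF and FI are assumed to come from the surrounding lowering code, and the 8-byte offset is illustrative.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
// Sketch: describe an access to FrameIndex FI plus 8 bytes.
MachinePointerInfo stackSlotPlus8(MachineFunction &MF, int FI) {
  MachinePointerInfo Base = MachinePointerInfo::getFixedStack(MF, FI);
  return Base.getWithOffset(8); // offset is relative to the base pointer value
}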
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs