SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
72 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
73}
74
77 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
78}
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, not all operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
206
207 // We need to custom lower vector stores from local memory
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
232 ISD::SETCC}) {
233 // FIXME: The promoted to type shouldn't need to be explicit
234 setOperationAction(Opc, MVT::bf16, Promote);
235 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
236 }
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279
283 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
284
285 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
286
288 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
289
291 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
292 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
293
295 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
296 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
297 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
298 Expand);
300 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
301 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
302 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
303 Expand);
304
306 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
307 MVT::v3i16, MVT::v4i16, MVT::Other},
308 Custom);
309
312 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
313
315
317
319 Expand);
320
321#if 0
323#endif
324
325 // We only support LOAD/STORE and vector manipulation ops for vectors
326 // with > 4 elements.
327 for (MVT VT :
328 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
329 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
330 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
331 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
332 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
333 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
334 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
335 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
336 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
337 switch (Op) {
338 case ISD::LOAD:
339 case ISD::STORE:
341 case ISD::BITCAST:
342 case ISD::UNDEF:
346 case ISD::IS_FPCLASS:
347 break;
352 break;
353 default:
355 break;
356 }
357 }
358 }
359
361
362 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
363 // is expanded to avoid having two separate loops in case the index is a VGPR.
364
365 // Most operations are naturally 32-bit vector operations. We only support
366 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
367 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
369 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
373
375 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
376
378 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
379 }
380
381 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
383 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
387
389 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
390
392 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
393 }
394
395 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
397 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
401
403 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
404
406 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
407 }
408
409 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
411 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
415
417 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
418
420 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
421 }
422
423 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
429
431 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
432
434 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
435 }
436
438 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
439 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
440 Custom);
441
442 if (Subtarget->hasPkMovB32()) {
443 // TODO: 16-bit element vectors should be legal with even aligned elements.
444 // TODO: Can be legal with wider source types than the result with
445 // subregister extracts.
446 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
447 }
448
450 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
451 // instead lower to cndmask in SITargetLowering::LowerSELECT().
453 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
454 // alignbit.
455 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
456
457 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
458 Custom);
459
460 // Avoid stack access for these.
461 // TODO: Generalize to more vector types.
463 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
464 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
465 Custom);
466
467 // Deal with vec3 vector operations when widened to vec4.
469 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
470
471 // Deal with vec5/6/7 vector operations when widened to vec8.
473 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
474 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
475 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
476 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
477 Custom);
478
479 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
480 // and output demarshalling
481 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
482
483 // We can't return success/failure, only the old value;
484 // let LLVM add the comparison.
486 Expand);
487
488 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
489
490 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
491
492 // FIXME: This should be narrowed to i32, but that only happens if i64 is
493 // illegal.
494 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
495 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
496
497 // This is s_memtime on SI and s_memrealtime on VI.
499
500 if (Subtarget->hasSMemRealTime() ||
501 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
504
505 if (Subtarget->has16BitInsts()) {
508 } else {
510 }
511
512 if (Subtarget->hasMadMacF32Insts())
514
517
518 // We only really have 32-bit BFE instructions (and 16-bit on VI).
519 //
520 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
521 // effort to match them now. We want this to be false for i64 cases when the
522 // extraction isn't restricted to the upper or lower half. Ideally we would
523 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
524 // span the midpoint are probably relatively rare, so don't worry about them
525 // for now.
527
528 // Clamp modifier on add/sub
529 if (Subtarget->hasIntClamp())
531
532 if (Subtarget->hasAddNoCarry())
533 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
534 Legal);
535
538 {MVT::f32, MVT::f64}, Custom);
539
540 // These are really only legal for ieee_mode functions. We should be avoiding
541 // them for functions that don't have ieee_mode enabled, so just say they are
542 // legal.
544 {MVT::f32, MVT::f64}, Legal);
545
546 if (Subtarget->haveRoundOpsF64())
548 Legal);
549 else
551 MVT::f64, Custom);
552
554 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
555 Legal);
556 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
557
560
561 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563
564 // Custom lower these because we can't specify a rule based on an illegal
565 // source bf16.
568
569 if (Subtarget->has16BitInsts()) {
572 MVT::i16, Legal);
573
574 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
575
577 MVT::i16, Expand);
578
582 ISD::CTPOP},
583 MVT::i16, Promote);
584
586
587 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
588
590 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
592 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
593
597
599
600 // F16 - Constant Actions.
603
604 // F16 - Load/Store Actions.
606 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
608 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
609
610 // BF16 - Load/Store Actions.
612 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
614 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
615
616 // F16 - VOP1 Actions.
619 MVT::f16, Custom);
620
621 // BF16 - VOP1 Actions.
622 if (Subtarget->hasBF16TransInsts())
624
627
628 // F16 - VOP2 Actions.
629 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
630 Expand);
634
635 // F16 - VOP3 Actions.
637 if (STI.hasMadF16())
639
640 for (MVT VT :
641 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
642 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
643 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
644 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
645 switch (Op) {
646 case ISD::LOAD:
647 case ISD::STORE:
649 case ISD::BITCAST:
650 case ISD::UNDEF:
655 case ISD::IS_FPCLASS:
656 break;
660 break;
661 default:
663 break;
664 }
665 }
666 }
667
668 // v_perm_b32 can handle either of these.
669 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
671
672 // XXX - Do these do anything? Vector constants turn into build_vector.
673 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
674
675 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
676 Legal);
677
679 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
681 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
682
684 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
686 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
687
688 setOperationAction(ISD::AND, MVT::v2i16, Promote);
689 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
690 setOperationAction(ISD::OR, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
694
696 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
698 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
699 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
701
703 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
705 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
707 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
708
710 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
712 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
713 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720
722 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
724 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
726 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
730 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
732 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
733 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
734
736 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
738 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
739 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
741
742 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
744 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
746 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
748
750 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
752 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
753 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
755
757 MVT::v2i32, Expand);
759
761 MVT::v4i32, Expand);
762
764 MVT::v8i32, Expand);
765
766 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
767 Subtarget->hasVOP3PInsts() ? Legal : Custom);
768
769 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
770 // This isn't really legal, but this avoids the legalizer unrolling it (and
771 // allows matching fneg (fabs x) patterns)
772 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
773
774 // Can do this in one BFI plus a constant materialize.
776 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
777 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
778 MVT::v32f16, MVT::v32bf16},
779 Custom);
780
783 MVT::f16, Custom);
785
788 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
789 Custom);
790
792 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 Expand);
794
795 for (MVT Vec16 :
796 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
797 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
800 Vec16, Custom);
802 }
803 }
804
805 if (Subtarget->hasVOP3PInsts()) {
809 MVT::v2i16, Legal);
810
813 MVT::v2f16, Legal);
814
816 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
817
819 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
820 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
821 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
822 Custom);
823
824 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
825 // Split vector operations.
830 VT, Custom);
831
832 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
833 // Split vector operations.
835 VT, Custom);
836
839 {MVT::v2f16, MVT::v4f16}, Custom);
840
841 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
842 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
843 Custom);
844
845 if (Subtarget->hasBF16PackedInsts()) {
846 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
847 // Split vector operations.
849 VT, Custom);
850 }
851
852 if (Subtarget->hasPackedFP32Ops()) {
854 MVT::v2f32, Legal);
856 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
857 Custom);
858 }
859 }
860
862
863 if (Subtarget->has16BitInsts()) {
865 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
867 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
868 } else {
869 // Legalization hack.
870 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
871
873 }
874
876 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
877 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
878 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
879 MVT::v32f16, MVT::v32bf16},
880 Custom);
881
883
884 if (Subtarget->hasVectorMulU64())
886 else if (Subtarget->hasScalarSMulU64())
888
889 if (Subtarget->hasMad64_32())
891
892 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
894
895 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
897 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
898 } else {
899 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
900 if (Subtarget->hasMinimum3Maximum3F32())
902
903 if (Subtarget->hasMinimum3Maximum3PKF16()) {
905
906 // If only the vector form is available, we need to widen to a vector.
907 if (!Subtarget->hasMinimum3Maximum3F16())
909 }
910 }
911
912 if (Subtarget->hasVOP3PInsts()) {
913 // We want to break these into v2f16 pieces, not scalarize.
915 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
916 Custom);
917 }
918
919 if (Subtarget->hasIntMinMax64())
921 Legal);
922
924 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
925 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
926 MVT::i8},
927 Custom);
928
930 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
931 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
932 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
933 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
934 Custom);
935
937 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
938 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
939 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
940 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
941 Custom);
942
948
949 // TODO: Could move this to custom lowering, could benefit from combines on
950 // extract of relevant bits.
952
954
955 if (Subtarget->hasBF16ConversionInsts()) {
956 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
958 }
959
960 if (Subtarget->hasBF16PackedInsts()) {
963 MVT::v2bf16, Legal);
964 }
965
966 if (Subtarget->hasBF16TransInsts()) {
968 }
969
970 if (Subtarget->hasCvtPkF16F32Inst()) {
972 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
973 Custom);
974 }
975
979 ISD::SUB,
981 ISD::MUL,
982 ISD::FADD,
983 ISD::FSUB,
984 ISD::FDIV,
985 ISD::FMUL,
994 ISD::FMA,
995 ISD::SMIN,
996 ISD::SMAX,
997 ISD::UMIN,
998 ISD::UMAX,
1001 ISD::SMIN,
1002 ISD::SMAX,
1003 ISD::UMIN,
1004 ISD::UMAX,
1005 ISD::AND,
1006 ISD::OR,
1007 ISD::XOR,
1008 ISD::SHL,
1009 ISD::SRL,
1010 ISD::SRA,
1011 ISD::FSHR,
1021
1022 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1024
1025 // All memory operations. Some folding on the pointer operand is done to help
1026 // matching the constant offsets in the addressing modes.
1028 ISD::STORE,
1053
1054 // FIXME: In other contexts we pretend this is a per-function property.
1056
1058}
1059
1060const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1061
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1064 return RCRegs;
1065}
1066
1067//===----------------------------------------------------------------------===//
1068// TargetLowering queries
1069//===----------------------------------------------------------------------===//
1070
1071// v_mad_mix* support a conversion from f16 to f32.
1072//
1073// There is only one special case where this is OK to use when denormals are
1074// enabled, and we don't currently handle it.
1075bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1076 EVT DestVT, EVT SrcVT) const {
1077 return DestVT.getScalarType() == MVT::f32 &&
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 SrcVT.getScalarType() == MVT::f16) ||
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1082 SrcVT.getScalarType() == MVT::bf16)) &&
1083 // TODO: This probably only requires no input flushing?
1085}
1086
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 DestTy.getScalarSizeInBits() == 32 &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1093 // TODO: This probably only requires no input flushing?
1094 denormalModeIsFlushAllF32(*MI.getMF());
1095}
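// Editorial note (not part of the upstream file): when these isFPExtFoldable
// hooks return true, the extend-folding combines may rewrite, e.g.,
//   fma (fpext f16:$a), (fpext f16:$b), f32:$c
// into a single mixed-precision instruction such as v_fma_mix_f32 that takes
// the f16 sources directly. Both hooks also require the f32 denormal mode to
// be flush-to-zero (denormalModeIsFlushAllF32) before allowing the fold.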
1096
1098 // SI has some legal vector types, but no legal vector operations. Say no
1099 // shuffles are legal in order to prefer scalarizing some vector operations.
1100 return false;
1101}
1102
1104 CallingConv::ID CC,
1105 EVT VT) const {
1107 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1108
1109 if (VT.isVector()) {
1110 EVT ScalarVT = VT.getScalarType();
1111 unsigned Size = ScalarVT.getSizeInBits();
1112 if (Size == 16) {
1113 if (Subtarget->has16BitInsts()) {
1114 if (VT.isInteger())
1115 return MVT::v2i16;
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 }
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1119 }
1120
1121 if (Size < 16)
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1123 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1124 }
1125
1126 if (VT.getSizeInBits() > 32)
1127 return MVT::i32;
1128
1129 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1130}
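// Worked example (editorial addition): for a vector argument taking the path
// above, a v8f16 value reports v2f16 as its register type on subtargets with
// 16-bit instructions (or i32 for bf16 elements) and f32 when 16-bit
// instructions are absent; a v4i64 value reports i32.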
1131
1133 CallingConv::ID CC,
1134 EVT VT) const {
1136 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1137
1138 if (VT.isVector()) {
1139 unsigned NumElts = VT.getVectorNumElements();
1140 EVT ScalarVT = VT.getScalarType();
1141 unsigned Size = ScalarVT.getSizeInBits();
1142
1143 // FIXME: Should probably promote 8-bit vectors to i16.
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1146
1147 if (Size <= 32)
1148 return NumElts;
1149
1150 if (Size > 32)
1151 return NumElts * ((Size + 31) / 32);
1152 } else if (VT.getSizeInBits() > 32)
1153 return (VT.getSizeInBits() + 31) / 32;
1154
1155 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1156}
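// Worked example (editorial addition): with 16-bit instructions, a v8f16
// argument is passed in (8 + 1) / 2 = 4 registers; a v3i64 argument needs
// 3 * ((64 + 31) / 32) = 6 registers regardless of 16-bit support.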
1157
1159 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
1161 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1162 unsigned NumElts = VT.getVectorNumElements();
1163 EVT ScalarVT = VT.getScalarType();
1164 unsigned Size = ScalarVT.getSizeInBits();
1165 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1166 // support, but unless we can properly handle 3-vectors, it will still be
1167 // inconsistent.
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1172 } else {
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1175 }
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size == 32) {
1181 RegisterVT = ScalarVT.getSimpleVT();
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size < 16 && Subtarget->has16BitInsts()) {
1188 // FIXME: Should probably form v2i16 pieces
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size != 16 && Size <= 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size > 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1207 }
1208 }
1209
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1212}
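// Worked example (editorial addition): with 16-bit instructions, v7f16 is
// broken into (7 + 1) / 2 = 4 v2f16 intermediates, and v4bf16 into 2 v2bf16
// intermediates held in i32 registers; v3f64 falls into the Size > 32 case
// and becomes 6 i32 intermediates.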
1213
1215 const DataLayout &DL, Type *Ty,
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1218
1219 LLVMContext &Ctx = Ty->getContext();
1220 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1222 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1223 NumElts);
1224 }
1225
1226 return TLI.getValueType(DL, Ty);
1227}
1228
1229// Peek through TFE struct returns to only use the data size.
1231 const DataLayout &DL, Type *Ty,
1232 unsigned MaxNumLanes) {
1233 auto *ST = dyn_cast<StructType>(Ty);
1234 if (!ST)
1235 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1236
1237 // TFE intrinsics return an aggregate type.
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1240 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1241}
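// Worked example (editorial addition): a TFE image load whose IR return type
// is { <4 x float>, i32 } reports a memVT of v4f32 (only the data member),
// and if the dmask limits MaxNumLanes to 2 the reported memVT shrinks to
// v2f32.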
1242
1243/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1244/// in-memory representation. This return value is a custom type because there
1245/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1246/// could cause issues during codegen, these address space 7 pointers will be
1247/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1248/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1249/// for cost modeling, to work. (This also sets us up decently for doing the
1250/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1252 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1258}
1259/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1260/// v8i32 when padding is added.
1261/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1262/// also v8i32 with padding.
1264 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1268 return MVT::v8i32;
1270}
1271
1272static unsigned getIntrMemWidth(unsigned IntrID) {
1273 switch (IntrID) {
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 return 8;
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 return 32;
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 return 64;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1295 return 128;
1296 default:
1297 llvm_unreachable("Unknown width");
1298 }
1299}
1300
1301static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1303 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1304 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1305 switch (AtomicOrderingCABI(Ord)) {
1308 break;
1311 break;
1314 break;
1315 default:
1317 break;
1318 }
1319
1320 Info.flags =
1322 Info.flags |= MOCooperative;
1323
1324 MDNode *ScopeMD = cast<MDNode>(
1325 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1326 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1327 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1328}
1329
1331 const CallBase &CI,
1332 MachineFunction &MF,
1333 unsigned IntrID) const {
1334 Info.flags = MachineMemOperand::MONone;
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1336 Info.flags |= MachineMemOperand::MOInvariant;
1337 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1339 Info.flags |= getTargetMMOFlags(CI);
1340
1341 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1343 AttributeSet Attr =
1345 MemoryEffects ME = Attr.getMemoryEffects();
1346 if (ME.doesNotAccessMemory())
1347 return false;
1348
1349 // TODO: Should images get their own address space?
1350 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1351
1352 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1353 if (RsrcIntr->IsImage) {
1354 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1356 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1357 Info.align.reset();
1358 }
1359
1360 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1361 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1362 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1363 // We conservatively set the memory operand of a buffer intrinsic to the
1364 // base resource pointer, so that we can access alias information about
1365 // those pointers. Cases like "this points at the same value
1366 // but with a different offset" are handled in
1367 // areMemAccessesTriviallyDisjoint.
1368 Info.ptrVal = RsrcArg;
1369 }
1370
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1372 if (!IsSPrefetch) {
1373 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1374 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1375 Info.flags |= MachineMemOperand::MOVolatile;
1376 }
1377
1379 if (ME.onlyReadsMemory()) {
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1382
1383 if (!BaseOpcode->Gather4) {
1384 // If this isn't a gather, we may have excess loaded elements in the
1385 // IR type. Check the dmask for the real number of elements loaded.
1386 unsigned DMask =
1387 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1388 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1389 }
1390
1391 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1392 CI.getType(), MaxNumLanes);
1393 } else {
1394 Info.memVT =
1396 std::numeric_limits<unsigned>::max());
1397 }
1398
1399 // FIXME: What does alignment mean for an image?
1400 Info.opc = ISD::INTRINSIC_W_CHAIN;
1401 Info.flags |= MachineMemOperand::MOLoad;
1402 } else if (ME.onlyWritesMemory()) {
1403 Info.opc = ISD::INTRINSIC_VOID;
1404
1405 Type *DataTy = CI.getArgOperand(0)->getType();
1406 if (RsrcIntr->IsImage) {
1407 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1408 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1409 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1410 DMaskLanes);
1411 } else
1412 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1413
1414 Info.flags |= MachineMemOperand::MOStore;
1415 } else {
1416 // Atomic, NoReturn Sampler or prefetch
1417 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1419 Info.flags |=
1421
1422 if (!IsSPrefetch)
1423 Info.flags |= MachineMemOperand::MOStore;
1424
1425 switch (IntrID) {
1426 default:
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1428 // Fake memory access type for no return sampler intrinsics
1429 Info.memVT = MVT::i32;
1430 } else {
1431 // XXX - Should this be volatile without known ordering?
1432 Info.flags |= MachineMemOperand::MOVolatile;
1433 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1434 }
1435 break;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1449 Info.memVT =
1451 std::numeric_limits<unsigned>::max());
1452 Info.flags &= ~MachineMemOperand::MOStore;
1453 return true;
1454 }
1455 }
1456 }
1457 return true;
1458 }
1459
1460 switch (IntrID) {
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1463 Info.opc = ISD::INTRINSIC_W_CHAIN;
1464 Info.memVT = MVT::getVT(CI.getType());
1465 Info.ptrVal = CI.getOperand(0);
1466 Info.align.reset();
1468
1469 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1470 if (!Vol->isZero())
1471 Info.flags |= MachineMemOperand::MOVolatile;
1472
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1479 Info.ptrVal = nullptr;
1480 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1482 return true;
1483 }
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1486 Info.opc = ISD::INTRINSIC_W_CHAIN;
1487 Info.memVT = MVT::getVT(CI.getType());
1488 Info.ptrVal = CI.getOperand(0);
1489 Info.align.reset();
1491
1492 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1493 if (!Vol->isZero())
1494 Info.flags |= MachineMemOperand::MOVolatile;
1495
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1503 Info.memVT = MVT::getVT(CI.getType());
1504 Info.ptrVal = CI.getOperand(0);
1505 Info.memVT = MVT::i64;
1506 Info.size = 8;
1507 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1512 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1513 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1514 Info.opc = ISD::INTRINSIC_W_CHAIN;
1515 Info.memVT =
1516 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1517 ? CI.getType()
1519 ->getElementType(0)); // XXX: what is correct VT?
1520
1521 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1522 Info.align.reset();
1523 Info.flags |=
1525 return true;
1526 }
1527 case Intrinsic::amdgcn_global_atomic_fmin_num:
1528 case Intrinsic::amdgcn_global_atomic_fmax_num:
1529 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1530 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1531 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1532 Info.opc = ISD::INTRINSIC_W_CHAIN;
1533 Info.memVT = MVT::getVT(CI.getType());
1534 Info.ptrVal = CI.getOperand(0);
1535 Info.align.reset();
1539 return true;
1540 }
1541 case Intrinsic::amdgcn_flat_load_monitor_b32:
1542 case Intrinsic::amdgcn_flat_load_monitor_b64:
1543 case Intrinsic::amdgcn_flat_load_monitor_b128:
1544 case Intrinsic::amdgcn_global_load_monitor_b32:
1545 case Intrinsic::amdgcn_global_load_monitor_b64:
1546 case Intrinsic::amdgcn_global_load_monitor_b128:
1547 case Intrinsic::amdgcn_cluster_load_b32:
1548 case Intrinsic::amdgcn_cluster_load_b64:
1549 case Intrinsic::amdgcn_cluster_load_b128:
1550 case Intrinsic::amdgcn_ds_load_tr6_b96:
1551 case Intrinsic::amdgcn_ds_load_tr4_b64:
1552 case Intrinsic::amdgcn_ds_load_tr8_b64:
1553 case Intrinsic::amdgcn_ds_load_tr16_b128:
1554 case Intrinsic::amdgcn_global_load_tr6_b96:
1555 case Intrinsic::amdgcn_global_load_tr4_b64:
1556 case Intrinsic::amdgcn_global_load_tr_b64:
1557 case Intrinsic::amdgcn_global_load_tr_b128:
1558 case Intrinsic::amdgcn_ds_read_tr4_b64:
1559 case Intrinsic::amdgcn_ds_read_tr6_b96:
1560 case Intrinsic::amdgcn_ds_read_tr8_b64:
1561 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1562 Info.opc = ISD::INTRINSIC_W_CHAIN;
1563 Info.memVT = MVT::getVT(CI.getType());
1564 Info.ptrVal = CI.getOperand(0);
1565 Info.align.reset();
1566 Info.flags |= MachineMemOperand::MOLoad;
1567 return true;
1568 }
1569 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1570 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1571 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1572 Info.opc = ISD::INTRINSIC_W_CHAIN;
1573 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1574 Info.ptrVal = CI.getOperand(0);
1575 Info.align.reset();
1576 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1577 return true;
1578 }
1579 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1582 Info.opc = ISD::INTRINSIC_VOID;
1583 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1584 Info.ptrVal = CI.getArgOperand(0);
1585 Info.align.reset();
1586 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_ds_gws_init:
1590 case Intrinsic::amdgcn_ds_gws_barrier:
1591 case Intrinsic::amdgcn_ds_gws_sema_v:
1592 case Intrinsic::amdgcn_ds_gws_sema_br:
1593 case Intrinsic::amdgcn_ds_gws_sema_p:
1594 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1595 Info.opc = ISD::INTRINSIC_VOID;
1596
1597 const GCNTargetMachine &TM =
1598 static_cast<const GCNTargetMachine &>(getTargetMachine());
1599
1601 Info.ptrVal = MFI->getGWSPSV(TM);
1602
1603 // This is an abstract access, but we need to specify a type and size.
1604 Info.memVT = MVT::i32;
1605 Info.size = 4;
1606 Info.align = Align(4);
1607
1608 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1609 Info.flags |= MachineMemOperand::MOLoad;
1610 else
1611 Info.flags |= MachineMemOperand::MOStore;
1612 return true;
1613 }
1614 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1615 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1616 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1617 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1618 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1619 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1620 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1621 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1622 Info.opc = ISD::INTRINSIC_VOID;
1623 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1624 Info.ptrVal = CI.getArgOperand(1);
1626 return true;
1627 }
1628 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1629 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1630 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1631 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1632 Info.opc = ISD::INTRINSIC_VOID;
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1634 Info.ptrVal = CI.getArgOperand(0);
1636 return true;
1637 }
1638 case Intrinsic::amdgcn_load_to_lds:
1639 case Intrinsic::amdgcn_global_load_lds: {
1640 Info.opc = ISD::INTRINSIC_VOID;
1641 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1642 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1643 Info.ptrVal = CI.getArgOperand(1);
1645 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1646 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1647 Info.flags |= MachineMemOperand::MOVolatile;
1648 return true;
1649 }
1650 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1651 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1652 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1653 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1654 Info.opc = ISD::INTRINSIC_W_CHAIN;
1655
1656 const GCNTargetMachine &TM =
1657 static_cast<const GCNTargetMachine &>(getTargetMachine());
1658
1660 Info.ptrVal = MFI->getGWSPSV(TM);
1661
1662 // This is an abstract access, but we need to specify a type and size.
1663 Info.memVT = MVT::i32;
1664 Info.size = 4;
1665 Info.align = Align(4);
1666
1668 return true;
1669 }
1670 case Intrinsic::amdgcn_s_prefetch_data:
1671 case Intrinsic::amdgcn_flat_prefetch:
1672 case Intrinsic::amdgcn_global_prefetch: {
1673 Info.opc = ISD::INTRINSIC_VOID;
1674 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1675 Info.ptrVal = CI.getArgOperand(0);
1676 Info.flags |= MachineMemOperand::MOLoad;
1677 return true;
1678 }
1679 default:
1680 return false;
1681 }
1682}
1683
1685 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1687 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1688 // The DAG's ValueType loses the addrspaces.
1689 // Add them as 2 extra Constant operands "from" and "to".
1690 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1691 unsigned DstAS = I.getType()->getPointerAddressSpace();
1692 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1693 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1694 break;
1695 }
1696 default:
1697 break;
1698 }
1699}
1700
1703 Type *&AccessTy) const {
1704 Value *Ptr = nullptr;
1705 switch (II->getIntrinsicID()) {
1706 case Intrinsic::amdgcn_cluster_load_b128:
1707 case Intrinsic::amdgcn_cluster_load_b64:
1708 case Intrinsic::amdgcn_cluster_load_b32:
1709 case Intrinsic::amdgcn_ds_append:
1710 case Intrinsic::amdgcn_ds_consume:
1711 case Intrinsic::amdgcn_ds_load_tr8_b64:
1712 case Intrinsic::amdgcn_ds_load_tr16_b128:
1713 case Intrinsic::amdgcn_ds_load_tr4_b64:
1714 case Intrinsic::amdgcn_ds_load_tr6_b96:
1715 case Intrinsic::amdgcn_ds_read_tr4_b64:
1716 case Intrinsic::amdgcn_ds_read_tr6_b96:
1717 case Intrinsic::amdgcn_ds_read_tr8_b64:
1718 case Intrinsic::amdgcn_ds_read_tr16_b64:
1719 case Intrinsic::amdgcn_ds_ordered_add:
1720 case Intrinsic::amdgcn_ds_ordered_swap:
1721 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1722 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1723 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1724 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1725 case Intrinsic::amdgcn_flat_load_monitor_b128:
1726 case Intrinsic::amdgcn_flat_load_monitor_b32:
1727 case Intrinsic::amdgcn_flat_load_monitor_b64:
1728 case Intrinsic::amdgcn_global_atomic_fmax_num:
1729 case Intrinsic::amdgcn_global_atomic_fmin_num:
1730 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1731 case Intrinsic::amdgcn_global_load_monitor_b128:
1732 case Intrinsic::amdgcn_global_load_monitor_b32:
1733 case Intrinsic::amdgcn_global_load_monitor_b64:
1734 case Intrinsic::amdgcn_global_load_tr_b64:
1735 case Intrinsic::amdgcn_global_load_tr_b128:
1736 case Intrinsic::amdgcn_global_load_tr4_b64:
1737 case Intrinsic::amdgcn_global_load_tr6_b96:
1738 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1739 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1740 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1741 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1742 Ptr = II->getArgOperand(0);
1743 break;
1744 case Intrinsic::amdgcn_load_to_lds:
1745 case Intrinsic::amdgcn_global_load_lds:
1746 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1747 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1748 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1749 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1750 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1751 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1752 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1753 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1754 Ptr = II->getArgOperand(1);
1755 break;
1756 default:
1757 return false;
1758 }
1759 AccessTy = II->getType();
1760 Ops.push_back(Ptr);
1761 return true;
1762}
1763
1765 unsigned AddrSpace) const {
1766 if (!Subtarget->hasFlatInstOffsets()) {
1767 // Flat instructions do not have offsets, and only have the register
1768 // address.
1769 return AM.BaseOffs == 0 && AM.Scale == 0;
1770 }
1771
1772 decltype(SIInstrFlags::FLAT) FlatVariant =
1776
1777 return AM.Scale == 0 &&
1778 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1779 AM.BaseOffs, AddrSpace, FlatVariant));
1780}
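// Editorial note (not part of the upstream file): flat addressing never
// supports register scaling, so any AddrMode with Scale != 0 is rejected
// here. On subtargets with flat instruction offsets, a mode such as
// {BaseReg, BaseOffs = 16, Scale = 0} is accepted only if isLegalFLATOffset
// approves that immediate for the given address space and FLAT variant.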
1781
1783 if (Subtarget->hasFlatGlobalInsts())
1785
1786 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1787 // Assume that we will use FLAT for all global memory accesses
1788 // on VI.
1789 // FIXME: This assumption is currently wrong. On VI we still use
1790 // MUBUF instructions for the r + i addressing mode. As currently
1791 // implemented, the MUBUF instructions only work on buffer < 4GB.
1792 // It may be possible to support > 4GB buffers with MUBUF instructions,
1793 // by setting the stride value in the resource descriptor which would
1794 // increase the size limit to (stride * 4GB). However, this is risky,
1795 // because it has never been validated.
1797 }
1798
1799 return isLegalMUBUFAddressingMode(AM);
1800}
1801
1802bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1803 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1804 // additionally can do r + r + i with addr64. 32-bit has more addressing
1805 // mode options. Depending on the resource constant, it can also do
1806 // (i64 r0) + (i32 r1) * (i14 i).
1807 //
1808 // Private arrays end up using a scratch buffer most of the time, so also
1809 // assume those use MUBUF instructions. Scratch loads / stores are currently
1810 // implemented as mubuf instructions with offen bit set, so slightly
1811 // different than the normal addr64.
1812 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1813 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1814 return false;
1815
1816 // FIXME: Since we can split immediate into soffset and immediate offset,
1817 // would it make sense to allow any immediate?
1818
1819 switch (AM.Scale) {
1820 case 0: // r + i or just i, depending on HasBaseReg.
1821 return true;
1822 case 1:
1823 return true; // We have r + r or r + i.
1824 case 2:
1825 if (AM.HasBaseReg) {
1826 // Reject 2 * r + r.
1827 return false;
1828 }
1829
1830 // Allow 2 * r as r + r
1831 // Or 2 * r + i is allowed as r + r + i.
1832 return true;
1833 default: // Don't allow n * r
1834 return false;
1835 }
1836}
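// Worked example (editorial addition): {BaseReg, BaseOffs = 16, Scale = 0}
// and {BaseReg, BaseOffs = 0, Scale = 1} are both accepted (r + i and r + r),
// {BaseReg, BaseOffs = 0, Scale = 2} is rejected because 2 * r + r is not
// expressible, and any larger Scale is rejected outright. The immediate must
// also pass isLegalMUBUFImmOffset.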
1837
1839 const AddrMode &AM, Type *Ty,
1840 unsigned AS,
1841 Instruction *I) const {
1842 // No global is ever allowed as a base.
1843 if (AM.BaseGV)
1844 return false;
1845
1846 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1847 return isLegalGlobalAddressingMode(AM);
1848
1849 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1853 // If the offset isn't a multiple of 4, it probably isn't going to be
1854 // correctly aligned.
1855 // FIXME: Can we get the real alignment here?
1856 if (AM.BaseOffs % 4 != 0)
1857 return isLegalMUBUFAddressingMode(AM);
1858
1859 if (!Subtarget->hasScalarSubwordLoads()) {
1860 // There are no SMRD extloads, so if we have to do a small type access we
1861 // will use a MUBUF load.
1862 // FIXME?: We also need to do this if unaligned, but we don't know the
1863 // alignment here.
1864 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1865 return isLegalGlobalAddressingMode(AM);
1866 }
1867
1868 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1869 // SMRD instructions have an 8-bit, dword offset on SI.
1870 if (!isUInt<8>(AM.BaseOffs / 4))
1871 return false;
1872 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1873 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1874 // in 8-bits, it can use a smaller encoding.
1875 if (!isUInt<32>(AM.BaseOffs / 4))
1876 return false;
1877 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1878 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1879 if (!isUInt<20>(AM.BaseOffs))
1880 return false;
1881 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1882 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1883 // for S_BUFFER_* instructions).
1884 if (!isInt<21>(AM.BaseOffs))
1885 return false;
1886 } else {
1887 // On GFX12, all offsets are signed 24-bit in bytes.
1888 if (!isInt<24>(AM.BaseOffs))
1889 return false;
1890 }
1891
1892 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1894 AM.BaseOffs < 0) {
1895 // Scalar (non-buffer) loads can only use a negative offset if
1896 // soffset+offset is non-negative. Since the compiler can only prove that
1897 // in a few special cases, it is safer to claim that negative offsets are
1898 // not supported.
1899 return false;
1900 }
1901
1902 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1903 return true;
1904
1905 if (AM.Scale == 1 && AM.HasBaseReg)
1906 return true;
1907
1908 return false;
1909 }
1910
1911 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1912 return Subtarget->enableFlatScratch()
1914 : isLegalMUBUFAddressingMode(AM);
1915
1916 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1917 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1918 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1919 // field.
1920 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1921 // an 8-bit dword offset but we don't know the alignment here.
1922 if (!isUInt<16>(AM.BaseOffs))
1923 return false;
1924
1925 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1926 return true;
1927
1928 if (AM.Scale == 1 && AM.HasBaseReg)
1929 return true;
1930
1931 return false;
1932 }
1933
1935 // For an unknown address space, this usually means the value is for some
1936 // reason being used for pure arithmetic rather than for an addressing
1937 // computation. We don't have instructions that compute pointers with any
1938 // addressing modes, so treat them as having no offset, like flat
1939 // instructions.
1941 }
1942
1943 // Assume a user alias of global for unknown address spaces.
1944 return isLegalGlobalAddressingMode(AM);
1945}
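// Illustrative sketch (not part of the lowering above): a hypothetical helper
// restating the per-generation scalar-load immediate-offset limits checked in
// isLegalAddressingMode. The enumerators are stand-ins, not the real subtarget
// generation values.
enum class ExampleSMemGen { SI, CI, VIToGFX8, GFX9ToGFX11, GFX12 };
static bool exampleSMemImmOffsetFits(ExampleSMemGen Gen, int64_t ByteOffset) {
  switch (Gen) {
  case ExampleSMemGen::SI:          // 8-bit unsigned dword offset.
    return isUInt<8>(ByteOffset / 4);
  case ExampleSMemGen::CI:          // 32-bit literal dword offset.
    return isUInt<32>(ByteOffset / 4);
  case ExampleSMemGen::VIToGFX8:    // 20-bit unsigned byte offset.
    return isUInt<20>(ByteOffset);
  case ExampleSMemGen::GFX9ToGFX11: // Signed 21-bit byte offset.
    return isInt<21>(ByteOffset);
  case ExampleSMemGen::GFX12:       // Signed 24-bit byte offset.
    return isInt<24>(ByteOffset);
  }
  return false;
}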
1946
1948 const MachineFunction &MF) const {
1950 return (MemVT.getSizeInBits() <= 4 * 32);
1951 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1952 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1953 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1954 }
1956 return (MemVT.getSizeInBits() <= 2 * 32);
1957 return true;
1958}
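// Illustrative sketch (not part of the lowering above): the store-merging size
// caps restated as plain numbers, assuming the usual mapping of address spaces
// to the branches above; MaxPrivateElementSize stands in for
// getMaxPrivateElementSize().
static unsigned exampleMaxMergedStoreBits(bool IsPrivate, bool IsLDSOrRegion,
                                          unsigned MaxPrivateElementSize) {
  if (IsPrivate)
    return 8 * MaxPrivateElementSize; // Bounded by the scratch element size.
  if (IsLDSOrRegion)
    return 2 * 32;                    // At most two dwords for DS accesses.
  return 4 * 32;                      // Otherwise up to a dwordx4.
}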
1959
1961 unsigned Size, unsigned AddrSpace, Align Alignment,
1962 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1963 if (IsFast)
1964 *IsFast = 0;
1965
1966 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1967 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1968 // Check if alignment requirements for ds_read/write instructions are
1969 // disabled.
1970 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1971 return false;
1972
1973 Align RequiredAlignment(
1974 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1975 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1976 Alignment < RequiredAlignment)
1977 return false;
1978
1979 // Either the alignment requirements are "enabled", or there is an
1980 // unaligned-LDS-access-related hardware bug even though the alignment
1981 // requirements are "disabled". In either case, we need to check for proper
1982 // alignment.
1983 //
1984 switch (Size) {
1985 case 64:
1986 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1987 // address is negative, then the instruction is incorrectly treated as
1988 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1989 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1990 // load later in the SILoadStoreOptimizer.
1991 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1992 return false;
1993
1994 // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we
1995 // can do a 4-byte-aligned, 8-byte access in a single operation using
1996 // ds_read2/write2_b32 with adjacent offsets.
1997 RequiredAlignment = Align(4);
1998
1999 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2000 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2001 // ds_write2_b32 depending on the alignment. In either case with either
2002 // alignment there is no faster way of doing this.
2003
2004 // The numbers returned here and below are not additive; they form a 'speed
2005 // rank'. They are only meant to be compared to decide whether one way of
2006 // lowering an operation is faster than another. For that purpose a
2007 // naturally aligned operation gets its bit size to indicate that "it
2008 // operates at a speed comparable to an N-bit wide load". With full
2009 // alignment ds128 is slower than ds96, for example. If underaligned, it
2010 // is comparable to the speed of a single dword access, which would then
2011 // mean 32 < 128 and it is faster to issue a wide load regardless.
2012 // 1 simply means "slow, don't do it". I.e. when comparing an aligned load
2013 // to a wider load that would no longer be aligned, the latter is slower.
2014 if (IsFast)
2015 *IsFast = (Alignment >= RequiredAlignment) ? 64
2016 : (Alignment < Align(4)) ? 32
2017 : 1;
2018 return true;
2019 }
2020
2021 break;
2022 case 96:
2023 if (!Subtarget->hasDS96AndDS128())
2024 return false;
2025
2026 // A 12-byte access via ds_read/write_b96 requires 16-byte alignment on
2027 // gfx8 and older.
2028
2029 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2030 // Naturally aligned access is fastest. However, also report it as Fast
2031 // if memory is aligned to less than a DWORD. A narrow load or store will
2032 // be just as slow as a single ds_read_b96/ds_write_b96, but there will
2033 // be more of them, so overall we pay less of a penalty issuing a single
2034 // instruction.
2035
2036 // See comment on the values above.
2037 if (IsFast)
2038 *IsFast = (Alignment >= RequiredAlignment) ? 96
2039 : (Alignment < Align(4)) ? 32
2040 : 1;
2041 return true;
2042 }
2043
2044 break;
2045 case 128:
2046 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2047 return false;
2048
2049 // A 16-byte access via ds_read/write_b128 requires 16-byte alignment on
2050 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
2051 // single operation using ds_read2/write2_b64.
2052 RequiredAlignment = Align(8);
2053
2054 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2055 // Naturally aligned access is fastest. However, also report it as Fast
2056 // if memory is aligned to less than a DWORD. A narrow load or store will
2057 // be just as slow as a single ds_read_b128/ds_write_b128, but there
2058 // will be more of them, so overall we pay less of a penalty issuing a
2059 // single instruction.
2060
2061 // See comment on the values above.
2062 if (IsFast)
2063 *IsFast = (Alignment >= RequiredAlignment) ? 128
2064 : (Alignment < Align(4)) ? 32
2065 : 1;
2066 return true;
2067 }
2068
2069 break;
2070 default:
2071 if (Size > 32)
2072 return false;
2073
2074 break;
2075 }
2076
2077 // See comment on the values above.
2078 // Note that we have a single-dword or sub-dword access here, so if
2079 // underaligned it is the slowest possible access, hence the returned value is 0.
2080 if (IsFast)
2081 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2082
2083 return Alignment >= RequiredAlignment ||
2084 Subtarget->hasUnalignedDSAccessEnabled();
2085 }
2086
2087 // FIXME: We have to be conservative here and assume that flat operations
2088 // will access scratch. If we had access to the IR function, then we
2089 // could determine if any private memory was used in the function.
2090 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2091 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2092 bool AlignedBy4 = Alignment >= Align(4);
2093 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2094 if (IsFast)
2095 *IsFast = AlignedBy4 ? Size : 1;
2096 return true;
2097 }
2098
2099 if (IsFast)
2100 *IsFast = AlignedBy4;
2101
2102 return AlignedBy4;
2103 }
2104
2105 // So long as they are correct, wide global memory operations perform better
2106 // than multiple smaller memory ops -- even when misaligned.
2107 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2108 if (IsFast)
2109 *IsFast = Size;
2110
2111 return Alignment >= Align(4) ||
2112 Subtarget->hasUnalignedBufferAccessEnabled();
2113 }
2114
2115 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2116 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2117 // out-of-bounds behavior, but in the edge case where an access starts
2118 // out-of-bounds and then enters in-bounds, the entire access would be treated
2119 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2120 // natural alignment of buffer accesses.
2121 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2122 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2123 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2124 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2125 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2126 return false;
2127 }
2128
2129 // Values smaller than a dword must be aligned.
2130 if (Size < 32)
2131 return false;
2132
2133 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2134 // byte-address are ignored, thus forcing Dword alignment.
2135 // This applies to private, global, and constant memory.
2136 if (IsFast)
2137 *IsFast = 1;
2138
2139 return Size >= 32 && Alignment >= Align(4);
2140}
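// Illustrative sketch (not part of the lowering above): the *IsFast values are
// a relative "speed rank", so a caller is expected to compare the ranks of two
// candidate lowerings rather than read the numbers as absolute costs.
// Hypothetical usage under that assumption:
static bool examplePreferWideAccess(unsigned WideRank, unsigned NarrowRank) {
  // A fully aligned b128 access reports 128 and beats an aligned b32 (32);
  // an underaligned wide access drops to 32 or even 1 ("slow, avoid it").
  return WideRank > NarrowRank;
}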
2141
2143 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2144 unsigned *IsFast) const {
2146 Alignment, Flags, IsFast);
2147}
2148
2150 LLVMContext &Context, const MemOp &Op,
2151 const AttributeList &FuncAttributes) const {
2152 // FIXME: Should account for address space here.
2153
2154 // The default fallback uses the private pointer size as a guess for a type to
2155 // use. Make sure we switch these to 64-bit accesses.
2156
2157 if (Op.size() >= 16 &&
2158 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2159 return MVT::v4i32;
2160
2161 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2162 return MVT::v2i32;
2163
2164 // Use the default.
2165 return MVT::Other;
2166}
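// Illustrative sketch (not part of the lowering above): the memcpy/memset type
// choice restated; Size is in bytes and DstAlignedBy4 stands in for
// Op.isDstAligned(Align(4)).
static const char *exampleMemOpTypeChoice(uint64_t Size, bool DstAlignedBy4) {
  if (Size >= 16 && DstAlignedBy4)
    return "v4i32"; // 16-byte, dword-aligned chunks.
  if (Size >= 8 && DstAlignedBy4)
    return "v2i32"; // 8-byte, dword-aligned chunks.
  return "default"; // Fall back to the generic choice (MVT::Other).
}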
2167
2169 const MemSDNode *MemNode = cast<MemSDNode>(N);
2170 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2171}
2172
2177
2179 unsigned DestAS) const {
2180 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2181 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2182 Subtarget->hasGloballyAddressableScratch()) {
2183 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2184 return false;
2185 }
2186
2187 // Flat -> private/local is a simple truncate.
2188 // Flat -> global is no-op
2189 return true;
2190 }
2191
2192 const GCNTargetMachine &TM =
2193 static_cast<const GCNTargetMachine &>(getTargetMachine());
2194 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2195}
2196
2204
2206 Type *Ty) const {
2207 // FIXME: Could be smarter if called for vector constants.
2208 return true;
2209}
2210
2212 unsigned Index) const {
2214 return false;
2215
2216 // TODO: Add more cases that are cheap.
2217 return Index == 0;
2218}
2219
2220bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2221 // TODO: This should be more aggressive, particular for 16-bit element
2222 // vectors. However there are some mixed improvements and regressions.
2223 EVT EltTy = VT.getVectorElementType();
2224 return EltTy.getSizeInBits() % 32 == 0;
2225}
2226
2228 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2229 switch (Op) {
2230 case ISD::LOAD:
2231 case ISD::STORE:
2232 return true;
2233 default:
2234 return false;
2235 }
2236 }
2237
2238 // SimplifySetCC uses this function to determine whether or not it should
2239 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2240 if (VT == MVT::i1 && Op == ISD::SETCC)
2241 return false;
2242
2244}
2245
2248 // This isn't really a constant pool but close enough.
2251 return PtrInfo;
2252}
2253
2254SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2255 const SDLoc &SL,
2256 SDValue Chain,
2257 uint64_t Offset) const {
2258 const DataLayout &DL = DAG.getDataLayout();
2262
2263 auto [InputPtrReg, RC, ArgTy] =
2264 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2265
2266 // We may not have the kernarg segment argument if we have no kernel
2267 // arguments.
2268 if (!InputPtrReg)
2269 return DAG.getConstant(Offset, SL, PtrVT);
2270
2272 SDValue BasePtr = DAG.getCopyFromReg(
2273 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2274
2275 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2276}
2277
2278SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2279 const SDLoc &SL) const {
2282 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2283}
2284
2285SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2286 const SDLoc &SL) const {
2287
2289 std::optional<uint32_t> KnownSize =
2291 if (KnownSize.has_value())
2292 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2293 return SDValue();
2294}
2295
2296SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2297 const SDLoc &SL, SDValue Val,
2298 bool Signed,
2299 const ISD::InputArg *Arg) const {
2300 // First, if it is a widened vector, narrow it.
2301 if (VT.isVector() &&
2303 EVT NarrowedVT =
2306 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2307 DAG.getConstant(0, SL, MVT::i32));
2308 }
2309
2310 // Then convert the vector elements or scalar value.
2311 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2312 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2313 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2314 }
2315
2316 if (MemVT.isFloatingPoint())
2317 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2318 else if (Signed)
2319 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2320 else
2321 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2322
2323 return Val;
2324}
2325
2326SDValue SITargetLowering::lowerKernargMemParameter(
2327 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2328 uint64_t Offset, Align Alignment, bool Signed,
2329 const ISD::InputArg *Arg) const {
2330
2331 MachinePointerInfo PtrInfo =
2333
2334 // Try to avoid using an extload by loading earlier than the argument address,
2335 // and extracting the relevant bits. The load should hopefully be merged with
2336 // the previous argument.
2337 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2338 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2339 int64_t AlignDownOffset = alignDown(Offset, 4);
2340 int64_t OffsetDiff = Offset - AlignDownOffset;
2341
2342 EVT IntVT = MemVT.changeTypeToInteger();
2343
2344 // TODO: If we passed in the base kernel offset we could have a better
2345 // alignment than 4, but we don't really need it.
2346 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2347 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2348 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2351
2352 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2353 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2354
2355 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2356 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2357 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2358
2359 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2360 }
2361
2362 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2363 SDValue Load = DAG.getLoad(
2364 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2366
2367 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2368 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2369}
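// Illustrative sketch (not part of the lowering above): the sub-dword kernarg
// path loads the containing dword and shifts the wanted bytes down. A plain
// scalar model of that arithmetic, assuming little-endian byte order:
static uint32_t exampleExtractSubDwordKernArg(uint32_t ContainingDword,
                                              uint64_t ArgOffset) {
  uint64_t AlignDownOffset = ArgOffset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = ArgOffset - AlignDownOffset;   // 0..3 bytes into dword
  // SRL by OffsetDiff * 8; the caller then truncates to the memory type.
  return ContainingDword >> (OffsetDiff * 8);
}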
2370
2371/// Coerce an argument which was passed in a different ABI type to the original
2372/// expected value type.
2373SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2374 SDValue Val,
2375 CCValAssign &VA,
2376 const SDLoc &SL) const {
2377 EVT ValVT = VA.getValVT();
2378
2379 // If this is an 8 or 16-bit value, it is really passed promoted
2380 // to 32 bits. Insert an assert[sz]ext to capture this, then
2381 // truncate to the right size.
2382 switch (VA.getLocInfo()) {
2383 case CCValAssign::Full:
2384 return Val;
2385 case CCValAssign::BCvt:
2386 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2387 case CCValAssign::SExt:
2388 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2389 DAG.getValueType(ValVT));
2390 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2391 case CCValAssign::ZExt:
2392 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2393 DAG.getValueType(ValVT));
2394 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2395 case CCValAssign::AExt:
2396 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2397 default:
2398 llvm_unreachable("Unknown loc info!");
2399 }
2400}
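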
2401
2402SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2403 CCValAssign &VA, const SDLoc &SL,
2404 SDValue Chain,
2405 const ISD::InputArg &Arg) const {
2406 MachineFunction &MF = DAG.getMachineFunction();
2407 MachineFrameInfo &MFI = MF.getFrameInfo();
2408
2409 if (Arg.Flags.isByVal()) {
2410 unsigned Size = Arg.Flags.getByValSize();
2411 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2412 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2413 }
2414
2415 unsigned ArgOffset = VA.getLocMemOffset();
2416 unsigned ArgSize = VA.getValVT().getStoreSize();
2417
2418 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2419
2420 // Create load nodes to retrieve arguments from the stack.
2421 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2422
2423 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2425 MVT MemVT = VA.getValVT();
2426
2427 switch (VA.getLocInfo()) {
2428 default:
2429 break;
2430 case CCValAssign::BCvt:
2431 MemVT = VA.getLocVT();
2432 break;
2433 case CCValAssign::SExt:
2434 ExtType = ISD::SEXTLOAD;
2435 break;
2436 case CCValAssign::ZExt:
2437 ExtType = ISD::ZEXTLOAD;
2438 break;
2439 case CCValAssign::AExt:
2440 ExtType = ISD::EXTLOAD;
2441 break;
2442 }
2443
2444 SDValue ArgValue = DAG.getExtLoad(
2445 ExtType, SL, VA.getLocVT(), Chain, FIN,
2447
2448 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2449 if (ConvertedVal == ArgValue)
2450 return ConvertedVal;
2451
2452 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2453}
2454
2455SDValue SITargetLowering::lowerWorkGroupId(
2456 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2459 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2460 if (!Subtarget->hasClusters())
2461 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2462
2463 // Clusters are supported. Return the global position in the grid. If clusters
2464 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2465
2466 // WorkGroupIdXYZ = ClusterId == 0 ?
2467 // ClusterIdXYZ :
2468 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2469 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2470 SDLoc SL(ClusterIdXYZ);
2471 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2472 SDValue One = DAG.getConstant(1, SL, VT);
2473 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2474 SDValue ClusterWorkGroupIdXYZ =
2475 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2476 SDValue GlobalIdXYZ =
2477 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2478 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2479
2480 switch (MFI.getClusterDims().getKind()) {
2483 return GlobalIdXYZ;
2485 return ClusterIdXYZ;
2487 using namespace AMDGPU::Hwreg;
2488 SDValue ClusterIdField =
2489 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2490 SDNode *GetReg =
2491 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2492 SDValue ClusterId(GetReg, 0);
2493 SDValue Zero = DAG.getConstant(0, SL, VT);
2494 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2495 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2496 }
2497 }
2498
2499 llvm_unreachable("nothing should reach here");
2500}
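// Illustrative sketch (not part of the lowering above): the global workgroup
// id reconstructed from cluster ids, as plain arithmetic. ClusterMaxId is the
// largest per-cluster workgroup id, so ClusterMaxId + 1 is the cluster size
// along that dimension.
static uint32_t exampleGlobalWorkGroupId(uint32_t ClusterId,
                                         uint32_t ClusterMaxId,
                                         uint32_t ClusterWorkGroupId) {
  return ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId;
}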
2501
2502SDValue SITargetLowering::getPreloadedValue(
2503 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2505 const ArgDescriptor *Reg = nullptr;
2506 const TargetRegisterClass *RC;
2507 LLT Ty;
2508
2510 const ArgDescriptor WorkGroupIDX =
2511 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2512 // If GridZ is not programmed in an entry function then the hardware will set
2513 // it to all zeros, so there is no need to mask the GridY value in the low
2514 // order bits.
2515 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2516 AMDGPU::TTMP7,
2517 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2518 const ArgDescriptor WorkGroupIDZ =
2519 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2520 const ArgDescriptor ClusterWorkGroupIDX =
2521 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2522 const ArgDescriptor ClusterWorkGroupIDY =
2523 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2524 const ArgDescriptor ClusterWorkGroupIDZ =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2526 const ArgDescriptor ClusterWorkGroupMaxIDX =
2527 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2528 const ArgDescriptor ClusterWorkGroupMaxIDY =
2529 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2530 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2531 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2532 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2533 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2534
2535 auto LoadConstant = [&](unsigned N) {
2536 return DAG.getConstant(N, SDLoc(), VT);
2537 };
2538
2539 if (Subtarget->hasArchitectedSGPRs() &&
2541 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2542 bool HasFixedDims = ClusterDims.isFixedDims();
2543
2544 switch (PVID) {
2546 Reg = &WorkGroupIDX;
2547 RC = &AMDGPU::SReg_32RegClass;
2548 Ty = LLT::scalar(32);
2549 break;
2551 Reg = &WorkGroupIDY;
2552 RC = &AMDGPU::SReg_32RegClass;
2553 Ty = LLT::scalar(32);
2554 break;
2556 Reg = &WorkGroupIDZ;
2557 RC = &AMDGPU::SReg_32RegClass;
2558 Ty = LLT::scalar(32);
2559 break;
2561 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2562 return LoadConstant(0);
2563 Reg = &ClusterWorkGroupIDX;
2564 RC = &AMDGPU::SReg_32RegClass;
2565 Ty = LLT::scalar(32);
2566 break;
2568 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2569 return LoadConstant(0);
2570 Reg = &ClusterWorkGroupIDY;
2571 RC = &AMDGPU::SReg_32RegClass;
2572 Ty = LLT::scalar(32);
2573 break;
2575 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2576 return LoadConstant(0);
2577 Reg = &ClusterWorkGroupIDZ;
2578 RC = &AMDGPU::SReg_32RegClass;
2579 Ty = LLT::scalar(32);
2580 break;
2582 if (HasFixedDims)
2583 return LoadConstant(ClusterDims.getDims()[0] - 1);
2584 Reg = &ClusterWorkGroupMaxIDX;
2585 RC = &AMDGPU::SReg_32RegClass;
2586 Ty = LLT::scalar(32);
2587 break;
2589 if (HasFixedDims)
2590 return LoadConstant(ClusterDims.getDims()[1] - 1);
2591 Reg = &ClusterWorkGroupMaxIDY;
2592 RC = &AMDGPU::SReg_32RegClass;
2593 Ty = LLT::scalar(32);
2594 break;
2596 if (HasFixedDims)
2597 return LoadConstant(ClusterDims.getDims()[2] - 1);
2598 Reg = &ClusterWorkGroupMaxIDZ;
2599 RC = &AMDGPU::SReg_32RegClass;
2600 Ty = LLT::scalar(32);
2601 break;
2603 Reg = &ClusterWorkGroupMaxFlatID;
2604 RC = &AMDGPU::SReg_32RegClass;
2605 Ty = LLT::scalar(32);
2606 break;
2607 default:
2608 break;
2609 }
2610 }
2611
2612 if (!Reg)
2613 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2614 if (!Reg) {
2616 // It's possible for a kernarg intrinsic call to appear in a kernel with
2617 // no allocated segment, in which case we do not add the user sgpr
2618 // argument, so just return null.
2619 return DAG.getConstant(0, SDLoc(), VT);
2620 }
2621
2622 // It's undefined behavior if a function marked with the amdgpu-no-*
2623 // attributes uses the corresponding intrinsic.
2624 return DAG.getPOISON(VT);
2625 }
2626
2627 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2628}
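// Illustrative sketch (not part of the lowering above): the ArgDescriptor masks
// pack the cluster workgroup ids and their maxima into 4-bit fields of TTMP6.
// A hypothetical decoder for one such field, given a non-zero mask from above:
static uint32_t exampleDecodeTTMP6Field(uint32_t TTMP6, uint32_t Mask) {
  // E.g. Mask == 0x000000F0u extracts ClusterWorkGroupIDY.
  return (TTMP6 & Mask) >> llvm::countr_zero(Mask);
}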
2629
2631 CallingConv::ID CallConv,
2632 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2633 FunctionType *FType,
2635 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2636 const ISD::InputArg *Arg = &Ins[I];
2637
2638 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2639 "vector type argument should have been split");
2640
2641 // First check if it's a PS input addr.
2642 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2643 PSInputNum <= 15) {
2644 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2645
2646 // Inconveniently only the first part of the split is marked as isSplit,
2647 // so skip to the end. We only want to increment PSInputNum once for the
2648 // entire split argument.
2649 if (Arg->Flags.isSplit()) {
2650 while (!Arg->Flags.isSplitEnd()) {
2651 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2652 "unexpected vector split in ps argument type");
2653 if (!SkipArg)
2654 Splits.push_back(*Arg);
2655 Arg = &Ins[++I];
2656 }
2657 }
2658
2659 if (SkipArg) {
2660 // We can safely skip PS inputs.
2661 Skipped.set(Arg->getOrigArgIndex());
2662 ++PSInputNum;
2663 continue;
2664 }
2665
2666 Info->markPSInputAllocated(PSInputNum);
2667 if (Arg->Used)
2668 Info->markPSInputEnabled(PSInputNum);
2669
2670 ++PSInputNum;
2671 }
2672
2673 Splits.push_back(*Arg);
2674 }
2675}
2676
2677// Allocate special inputs passed in VGPRs.
2679 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2680 SIMachineFunctionInfo &Info) const {
2681 const LLT S32 = LLT::scalar(32);
2683
2684 if (Info.hasWorkItemIDX()) {
2685 Register Reg = AMDGPU::VGPR0;
2686 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2687
2688 CCInfo.AllocateReg(Reg);
2689 unsigned Mask =
2690 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2691 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2692 }
2693
2694 if (Info.hasWorkItemIDY()) {
2695 assert(Info.hasWorkItemIDX());
2696 if (Subtarget->hasPackedTID()) {
2697 Info.setWorkItemIDY(
2698 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2699 } else {
2700 unsigned Reg = AMDGPU::VGPR1;
2701 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2702
2703 CCInfo.AllocateReg(Reg);
2704 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2705 }
2706 }
2707
2708 if (Info.hasWorkItemIDZ()) {
2709 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2710 if (Subtarget->hasPackedTID()) {
2711 Info.setWorkItemIDZ(
2712 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2713 } else {
2714 unsigned Reg = AMDGPU::VGPR2;
2715 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2716
2717 CCInfo.AllocateReg(Reg);
2718 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2719 }
2720 }
2721}
2722
2723// Try to allocate a VGPR at the end of the argument list, or, if no argument
2724// VGPRs are left, allocate a stack slot.
2725// If \p Mask is given it indicates the bitfield position in the register.
2726// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2727static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2728 ArgDescriptor Arg = ArgDescriptor()) {
2729 if (Arg.isSet())
2730 return ArgDescriptor::createArg(Arg, Mask);
2731
2732 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2733 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2734 if (RegIdx == ArgVGPRs.size()) {
2735 // Spill to stack required.
2736 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2737
2738 return ArgDescriptor::createStack(Offset, Mask);
2739 }
2740
2741 unsigned Reg = ArgVGPRs[RegIdx];
2742 Reg = CCInfo.AllocateReg(Reg);
2743 assert(Reg != AMDGPU::NoRegister);
2744
2745 MachineFunction &MF = CCInfo.getMachineFunction();
2746 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2747 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2748 return ArgDescriptor::createRegister(Reg, Mask);
2749}
2750
2752 const TargetRegisterClass *RC,
2753 unsigned NumArgRegs) {
2754 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2755 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2756 if (RegIdx == ArgSGPRs.size())
2757 report_fatal_error("ran out of SGPRs for arguments");
2758
2759 unsigned Reg = ArgSGPRs[RegIdx];
2760 Reg = CCInfo.AllocateReg(Reg);
2761 assert(Reg != AMDGPU::NoRegister);
2762
2763 MachineFunction &MF = CCInfo.getMachineFunction();
2764 MF.addLiveIn(Reg, RC);
2766}
2767
2768// If this has a fixed position, we still should allocate the register in the
2769// CCInfo state. Technically we could get away with this for values passed
2770// outside of the normal argument range.
2772 const TargetRegisterClass *RC,
2773 MCRegister Reg) {
2774 Reg = CCInfo.AllocateReg(Reg);
2775 assert(Reg != AMDGPU::NoRegister);
2776 MachineFunction &MF = CCInfo.getMachineFunction();
2777 MF.addLiveIn(Reg, RC);
2778}
2779
2780static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2781 if (Arg) {
2782 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2783 Arg.getRegister());
2784 } else
2785 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2786}
2787
2788static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2789 if (Arg) {
2790 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2791 Arg.getRegister());
2792 } else
2793 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2794}
2795
2796/// Allocate implicit function VGPR arguments at the end of allocated user
2797/// arguments.
2799 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2800 SIMachineFunctionInfo &Info) const {
2801 const unsigned Mask = 0x3ff;
2802 ArgDescriptor Arg;
2803
2804 if (Info.hasWorkItemIDX()) {
2805 Arg = allocateVGPR32Input(CCInfo, Mask);
2806 Info.setWorkItemIDX(Arg);
2807 }
2808
2809 if (Info.hasWorkItemIDY()) {
2810 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2811 Info.setWorkItemIDY(Arg);
2812 }
2813
2814 if (Info.hasWorkItemIDZ())
2815 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2816}
2817
2818/// Allocate implicit function VGPR arguments in fixed registers.
2820 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2821 SIMachineFunctionInfo &Info) const {
2822 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2823 if (!Reg)
2824 report_fatal_error("failed to allocate VGPR for implicit arguments");
2825
2826 const unsigned Mask = 0x3ff;
2827 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2828 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2829 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2830}
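// Illustrative sketch (not part of the lowering above): with packed workitem
// ids, all three ids share one VGPR at 10 bits each, matching the masks used
// above. A plain model of the unpacking this implies:
static void exampleUnpackWorkItemIDs(uint32_t PackedTID, uint32_t &X,
                                     uint32_t &Y, uint32_t &Z) {
  X = PackedTID & 0x3ff;         // bits [9:0]
  Y = (PackedTID >> 10) & 0x3ff; // bits [19:10]
  Z = (PackedTID >> 20) & 0x3ff; // bits [29:20]
}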
2831
2833 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2834 SIMachineFunctionInfo &Info) const {
2835 auto &ArgInfo = Info.getArgInfo();
2836 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2837
2838 // TODO: Unify handling with private memory pointers.
2839 if (UserSGPRInfo.hasDispatchPtr())
2840 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2841
2842 if (UserSGPRInfo.hasQueuePtr())
2843 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2844
2845 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2846 // constant offset from the kernarg segment.
2847 if (Info.hasImplicitArgPtr())
2848 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2849
2850 if (UserSGPRInfo.hasDispatchID())
2851 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2852
2853 // flat_scratch_init is not applicable for non-kernel functions.
2854
2855 if (Info.hasWorkGroupIDX())
2856 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2857
2858 if (Info.hasWorkGroupIDY())
2859 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2860
2861 if (Info.hasWorkGroupIDZ())
2862 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2863
2864 if (Info.hasLDSKernelId())
2865 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2866}
2867
2868// Allocate special inputs passed in user SGPRs.
2870 MachineFunction &MF,
2871 const SIRegisterInfo &TRI,
2872 SIMachineFunctionInfo &Info) const {
2873 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2874 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2875 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2876 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2877 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2878 }
2879
2880 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2881 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2882 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2883 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2884 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2885 }
2886
2887 if (UserSGPRInfo.hasDispatchPtr()) {
2888 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2889 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2890 CCInfo.AllocateReg(DispatchPtrReg);
2891 }
2892
2893 if (UserSGPRInfo.hasQueuePtr()) {
2894 Register QueuePtrReg = Info.addQueuePtr(TRI);
2895 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2896 CCInfo.AllocateReg(QueuePtrReg);
2897 }
2898
2899 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2901 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2902 CCInfo.AllocateReg(InputPtrReg);
2903
2904 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2905 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2906 }
2907
2908 if (UserSGPRInfo.hasDispatchID()) {
2909 Register DispatchIDReg = Info.addDispatchID(TRI);
2910 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2911 CCInfo.AllocateReg(DispatchIDReg);
2912 }
2913
2914 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2915 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2916 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2917 CCInfo.AllocateReg(FlatScratchInitReg);
2918 }
2919
2920 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2921 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2922 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2923 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2924 }
2925
2926 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2927 // these from the dispatch pointer.
2928}
2929
2930// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2931// sequential starting from the first argument.
2933 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2935 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2936 Function &F = MF.getFunction();
2937 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2938 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2939 bool InPreloadSequence = true;
2940 unsigned InIdx = 0;
2941 bool AlignedForImplictArgs = false;
2942 unsigned ImplicitArgOffset = 0;
2943 for (auto &Arg : F.args()) {
2944 if (!InPreloadSequence || !Arg.hasInRegAttr())
2945 break;
2946
2947 unsigned ArgIdx = Arg.getArgNo();
2948 // Don't preload non-original args or parts not in the current preload
2949 // sequence.
2950 if (InIdx < Ins.size() &&
2951 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2952 break;
2953
2954 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2955 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2956 InIdx++) {
2957 assert(ArgLocs[ArgIdx].isMemLoc());
2958 auto &ArgLoc = ArgLocs[InIdx];
2959 const Align KernelArgBaseAlign = Align(16);
2960 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2961 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2962 unsigned NumAllocSGPRs =
2963 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2964
2965 // Fix alignment for hidden arguments.
2966 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2967 if (!AlignedForImplictArgs) {
2968 ImplicitArgOffset =
2969 alignTo(LastExplicitArgOffset,
2970 Subtarget->getAlignmentForImplicitArgPtr()) -
2971 LastExplicitArgOffset;
2972 AlignedForImplictArgs = true;
2973 }
2974 ArgOffset += ImplicitArgOffset;
2975 }
2976
2977 // Arg is preloaded into the previous SGPR.
2978 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2979 assert(InIdx >= 1 && "No previous SGPR");
2980 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2981 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2982 continue;
2983 }
2984
2985 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2986 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2987 // Check for free user SGPRs for preloading.
2988 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2989 InPreloadSequence = false;
2990 break;
2991 }
2992
2993 // Preload this argument.
2994 const TargetRegisterClass *RC =
2995 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2996 SmallVectorImpl<MCRegister> *PreloadRegs =
2997 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2998
2999 if (PreloadRegs->size() > 1)
3000 RC = &AMDGPU::SGPR_32RegClass;
3001 for (auto &Reg : *PreloadRegs) {
3002 assert(Reg);
3003 MF.addLiveIn(Reg, RC);
3004 CCInfo.AllocateReg(Reg);
3005 }
3006
3007 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3008 }
3009 }
3010}
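// Illustrative sketch (not part of the lowering above): the user-SGPR budgeting
// for kernarg preloading, restated. ArgOffset and LastExplicitArgOffset are
// byte offsets into the kernarg segment; ArgBits is the argument size in bits.
static unsigned examplePreloadSGPRsNeeded(unsigned ArgOffset,
                                          unsigned LastExplicitArgOffset,
                                          unsigned ArgBits) {
  unsigned PaddingSGPRs = alignTo(ArgOffset - LastExplicitArgOffset, 4) / 4;
  unsigned NumAllocSGPRs = alignTo(ArgBits, 32) / 32;
  // Preloading stops once this exceeds the remaining free user SGPRs.
  return PaddingSGPRs + NumAllocSGPRs;
}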
3011
3013 const SIRegisterInfo &TRI,
3014 SIMachineFunctionInfo &Info) const {
3015 // Always allocate this last since it is a synthetic preload.
3016 if (Info.hasLDSKernelId()) {
3017 Register Reg = Info.addLDSKernelId();
3018 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3019 CCInfo.AllocateReg(Reg);
3020 }
3021}
3022
3023// Allocate special input registers that are initialized per-wave.
3026 CallingConv::ID CallConv,
3027 bool IsShader) const {
3028 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3029 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3030 // Note: user SGPRs are handled by the front-end for graphics shaders
3031 // Pad up the used user SGPRs with dead inputs.
3032
3033 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3034 // before enabling architected SGPRs for workgroup IDs.
3035 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3036
3037 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3038 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3039 // rely on it to reach 16 since if we end up having no stack usage, it will
3040 // not really be added.
3041 unsigned NumRequiredSystemSGPRs =
3042 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3043 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3044 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3045 Register Reg = Info.addReservedUserSGPR();
3046 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3047 CCInfo.AllocateReg(Reg);
3048 }
3049 }
3050
3051 if (!HasArchitectedSGPRs) {
3052 if (Info.hasWorkGroupIDX()) {
3053 Register Reg = Info.addWorkGroupIDX();
3054 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3055 CCInfo.AllocateReg(Reg);
3056 }
3057
3058 if (Info.hasWorkGroupIDY()) {
3059 Register Reg = Info.addWorkGroupIDY();
3060 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3061 CCInfo.AllocateReg(Reg);
3062 }
3063
3064 if (Info.hasWorkGroupIDZ()) {
3065 Register Reg = Info.addWorkGroupIDZ();
3066 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3067 CCInfo.AllocateReg(Reg);
3068 }
3069 }
3070
3071 if (Info.hasWorkGroupInfo()) {
3072 Register Reg = Info.addWorkGroupInfo();
3073 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3074 CCInfo.AllocateReg(Reg);
3075 }
3076
3077 if (Info.hasPrivateSegmentWaveByteOffset()) {
3078 // Scratch wave offset passed in system SGPR.
3079 unsigned PrivateSegmentWaveByteOffsetReg;
3080
3081 if (IsShader) {
3082 PrivateSegmentWaveByteOffsetReg =
3083 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3084
3085 // This is true if the scratch wave byte offset doesn't have a fixed
3086 // location.
3087 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3088 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3089 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3090 }
3091 } else
3092 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3093
3094 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3095 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3096 }
3097
3098 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3099 Info.getNumPreloadedSGPRs() >= 16);
3100}
3101
3103 MachineFunction &MF,
3104 const SIRegisterInfo &TRI,
3106 // Now that we've figured out where the scratch register inputs are, see if
3107 // we should reserve the arguments and use them directly.
3108 MachineFrameInfo &MFI = MF.getFrameInfo();
3109 bool HasStackObjects = MFI.hasStackObjects();
3110 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3111
3112 // Record that we know we have non-spill stack objects so we don't need to
3113 // check all stack objects later.
3114 if (HasStackObjects)
3115 Info.setHasNonSpillStackObjects(true);
3116
3117 // Everything live out of a block is spilled with fast regalloc, so it's
3118 // almost certain that spilling will be required.
3120 HasStackObjects = true;
3121
3122 // For now assume stack access is needed in any callee functions, so we need
3123 // the scratch registers to pass in.
3124 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3125
3126 if (!ST.enableFlatScratch()) {
3127 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3128 // If we have stack objects, we unquestionably need the private buffer
3129 // resource. For the Code Object V2 ABI, this will be the first 4 user
3130 // SGPR inputs. We can reserve those and use them directly.
3131
3132 Register PrivateSegmentBufferReg =
3134 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3135 } else {
3136 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3137 // We tentatively reserve the last registers (skipping those which may
3138 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3139 // we'll replace these with the ones immediately after those which were
3140 // really allocated. In the prologue copies will be inserted from the
3141 // argument to these reserved registers.
3142
3143 // Without HSA, relocations are used for the scratch pointer and the
3144 // buffer resource setup is always inserted in the prologue. Scratch wave
3145 // offset is still in an input SGPR.
3146 Info.setScratchRSrcReg(ReservedBufferReg);
3147 }
3148 }
3149
3151
3152 // For entry functions we have to set up the stack pointer if we use it,
3153 // whereas non-entry functions get this "for free". This means there is no
3154 // intrinsic advantage to using S32 over S34 in cases where we do not have
3155 // calls but do need a frame pointer (i.e. if we are requested to have one
3156 // because frame pointer elimination is disabled). To keep things simple we
3157 // only ever use S32 as the call ABI stack pointer, and so using it does not
3158 // imply we need a separate frame pointer.
3159 //
3160 // Try to use s32 as the SP, but move it if it would interfere with input
3161 // arguments. This won't work with calls though.
3162 //
3163 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3164 // registers.
3165 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3166 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3167 } else {
3169
3170 if (MFI.hasCalls())
3171 report_fatal_error("call in graphics shader with too many input SGPRs");
3172
3173 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3174 if (!MRI.isLiveIn(Reg)) {
3175 Info.setStackPtrOffsetReg(Reg);
3176 break;
3177 }
3178 }
3179
3180 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3181 report_fatal_error("failed to find register for SP");
3182 }
3183
3184 // hasFP should be accurate for entry functions even before the frame is
3185 // finalized, because it does not rely on the known stack size, only
3186 // properties like whether variable sized objects are present.
3187 if (ST.getFrameLowering()->hasFP(MF)) {
3188 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3189 }
3190}
3191
3194 return !Info->isEntryFunction();
3195}
3196
3198
3200 MachineBasicBlock *Entry,
3201 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3203
3204 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3205 if (!IStart)
3206 return;
3207
3208 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3209 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3210 MachineBasicBlock::iterator MBBI = Entry->begin();
3211 for (const MCPhysReg *I = IStart; *I; ++I) {
3212 const TargetRegisterClass *RC = nullptr;
3213 if (AMDGPU::SReg_64RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_64RegClass;
3215 else if (AMDGPU::SReg_32RegClass.contains(*I))
3216 RC = &AMDGPU::SGPR_32RegClass;
3217 else
3218 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3219
3220 Register NewVR = MRI->createVirtualRegister(RC);
3221 // Create copy from CSR to a virtual register.
3222 Entry->addLiveIn(*I);
3223 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3224 .addReg(*I);
3225
3226 // Insert the copy-back instructions right before the terminator.
3227 for (auto *Exit : Exits)
3228 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3229 TII->get(TargetOpcode::COPY), *I)
3230 .addReg(NewVR);
3231 }
3232}
3233
3235 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3236 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3237 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3239
3241 const Function &Fn = MF.getFunction();
3244 bool IsError = false;
3245
3246 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3248 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3249 IsError = true;
3250 }
3251
3254 BitVector Skipped(Ins.size());
3255 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3256 *DAG.getContext());
3257
3258 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3259 bool IsKernel = AMDGPU::isKernel(CallConv);
3260 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3261
3262 if (IsGraphics) {
3263 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3264 assert(!UserSGPRInfo.hasDispatchPtr() &&
3265 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3266 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3267 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3268 (void)UserSGPRInfo;
3269 if (!Subtarget->enableFlatScratch())
3270 assert(!UserSGPRInfo.hasFlatScratchInit());
3271 if ((CallConv != CallingConv::AMDGPU_CS &&
3272 CallConv != CallingConv::AMDGPU_Gfx &&
3273 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3274 !Subtarget->hasArchitectedSGPRs())
3275 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3276 !Info->hasWorkGroupIDZ());
3277 }
3278
3279 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3280
3281 if (CallConv == CallingConv::AMDGPU_PS) {
3282 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3283
3284 // At least one interpolation mode must be enabled or else the GPU will
3285 // hang.
3286 //
3287 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3288 // set PSInputAddr, the user wants to enable some bits after the compilation
3289 // based on run-time states. Since we can't know what the final PSInputEna
3290 // will look like, so we shouldn't do anything here and the user should take
3291 // responsibility for the correct programming.
3292 //
3293 // Otherwise, the following restrictions apply:
3294 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3295 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3296 // enabled too.
3297 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3298 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3299 CCInfo.AllocateReg(AMDGPU::VGPR0);
3300 CCInfo.AllocateReg(AMDGPU::VGPR1);
3301 Info->markPSInputAllocated(0);
3302 Info->markPSInputEnabled(0);
3303 }
3304 if (Subtarget->isAmdPalOS()) {
3305 // For isAmdPalOS, the user does not enable some bits after compilation
3306 // based on run-time states; the register values being generated here are
3307 // the final ones set in hardware. Therefore we need to apply the
3308 // workaround to PSInputAddr and PSInputEnable together. (The case where
3309 // a bit is set in PSInputAddr but not PSInputEnable is where the
3310 // frontend set up an input arg for a particular interpolation mode, but
3311 // nothing uses that input arg. Really we should have an earlier pass
3312 // that removes such an arg.)
3313 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3314 if ((PsInputBits & 0x7F) == 0 ||
3315 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3316 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3317 }
3318 } else if (IsKernel) {
3319 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3320 } else {
3321 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3322 Ins.end());
3323 }
3324
3325 if (IsKernel)
3326 analyzeFormalArgumentsCompute(CCInfo, Ins);
3327
3328 if (IsEntryFunc) {
3329 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3330 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3331 if (IsKernel && Subtarget->hasKernargPreload())
3332 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3333
3334 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3335 } else if (!IsGraphics) {
3336 // For the fixed ABI, pass workitem IDs in the last argument register.
3337 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3338
3339 // FIXME: Sink this into allocateSpecialInputSGPRs
3340 if (!Subtarget->enableFlatScratch())
3341 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3342
3343 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3344 }
3345
3346 if (!IsKernel) {
3347 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3348 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3349
3350 // This assumes the registers are allocated by CCInfo in ascending order
3351 // with no gaps.
3352 Info->setNumWaveDispatchSGPRs(
3353 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3354 Info->setNumWaveDispatchVGPRs(
3355 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3356 } else if (Info->getNumKernargPreloadedSGPRs()) {
3357 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3358 }
3359
3361
3362 if (IsWholeWaveFunc) {
3363 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3364 {MVT::i1, MVT::Other}, Chain);
3365 InVals.push_back(Setup.getValue(0));
3366 Chains.push_back(Setup.getValue(1));
3367 }
3368
3369 // FIXME: This is the minimum kernel argument alignment. We should improve
3370 // this to the maximum alignment of the arguments.
3371 //
3372 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3373 // kern arg offset.
3374 const Align KernelArgBaseAlign = Align(16);
3375
3376 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3377 ++i) {
3378 const ISD::InputArg &Arg = Ins[i];
3379 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3380 InVals.push_back(DAG.getPOISON(Arg.VT));
3381 continue;
3382 }
3383
3384 CCValAssign &VA = ArgLocs[ArgIdx++];
3385 MVT VT = VA.getLocVT();
3386
3387 if (IsEntryFunc && VA.isMemLoc()) {
3388 VT = Ins[i].VT;
3389 EVT MemVT = VA.getLocVT();
3390
3391 const uint64_t Offset = VA.getLocMemOffset();
3392 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3393
3394 if (Arg.Flags.isByRef()) {
3395 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3396
3397 const GCNTargetMachine &TM =
3398 static_cast<const GCNTargetMachine &>(getTargetMachine());
3399 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3400 Arg.Flags.getPointerAddrSpace())) {
3403 }
3404
3405 InVals.push_back(Ptr);
3406 continue;
3407 }
3408
3409 SDValue NewArg;
3410 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3411 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3412 // In this case the argument is packed into the previous preload SGPR.
3413 int64_t AlignDownOffset = alignDown(Offset, 4);
3414 int64_t OffsetDiff = Offset - AlignDownOffset;
3415 EVT IntVT = MemVT.changeTypeToInteger();
3416
3417 const SIMachineFunctionInfo *Info =
3420 Register Reg =
3421 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3422
3423 assert(Reg);
3424 Register VReg = MRI.getLiveInVirtReg(Reg);
3425 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3426
3427 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3428 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3429
3430 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3431 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3432 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3433 Ins[i].Flags.isSExt(), &Ins[i]);
3434
3435 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3436 } else {
3437 const SIMachineFunctionInfo *Info =
3440 const SmallVectorImpl<MCRegister> &PreloadRegs =
3441 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3442
3443 SDValue Copy;
3444 if (PreloadRegs.size() == 1) {
3445 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3446 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3447 NewArg = DAG.getCopyFromReg(
3448 Chain, DL, VReg,
3450 TRI->getRegSizeInBits(*RC)));
3451
3452 } else {
3453 // If the kernarg alignment does not match the alignment of the SGPR
3454 // tuple RC that can accommodate this argument, it will be built up
3455 // via copies from the individual SGPRs that the argument was
3456 // preloaded to.
3458 for (auto Reg : PreloadRegs) {
3459 Register VReg = MRI.getLiveInVirtReg(Reg);
3460 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3461 Elts.push_back(Copy);
3462 }
3463 NewArg =
3464 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3465 PreloadRegs.size()),
3466 DL, Elts);
3467 }
3468
3469 // If the argument was preloaded to multiple consecutive 32-bit
3470 // registers because of misalignment between addressable SGPR tuples
3471 // and the argument size, we can still assume, because of kernarg
3472 // segment alignment restrictions, that NewArg's size is the same as
3473 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3474 // truncate since we cannot preload to less than a single SGPR and the
3475 // MemVT may be smaller.
3476 EVT MemVTInt =
3478 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3479 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3480
3481 NewArg = DAG.getBitcast(MemVT, NewArg);
3482 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3483 Ins[i].Flags.isSExt(), &Ins[i]);
3484 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3485 }
3486 } else {
3487 // Hidden arguments that are in the kernel signature must be preloaded
3488 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3489 // the argument list and is not preloaded.
3490 if (Arg.isOrigArg()) {
3491 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3492 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3494 *OrigArg->getParent(),
3495 "hidden argument in kernel signature was not preloaded",
3496 DL.getDebugLoc()));
3497 }
3498 }
3499
3500 NewArg =
3501 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3502 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3503 }
3504 Chains.push_back(NewArg.getValue(1));
3505
3506 auto *ParamTy =
3507 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3508 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3509 ParamTy &&
3510 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3511 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3512 // On SI local pointers are just offsets into LDS, so they are always
3513 // less than 16-bits. On CI and newer they could potentially be
3514 // real pointers, so we can't guarantee their size.
3515 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3516 DAG.getValueType(MVT::i16));
3517 }
3518
3519 InVals.push_back(NewArg);
3520 continue;
3521 }
3522 if (!IsEntryFunc && VA.isMemLoc()) {
3523 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3524 InVals.push_back(Val);
3525 if (!Arg.Flags.isByVal())
3526 Chains.push_back(Val.getValue(1));
3527 continue;
3528 }
3529
3530 assert(VA.isRegLoc() && "Parameter must be in a register!");
3531
3532 Register Reg = VA.getLocReg();
3533 const TargetRegisterClass *RC = nullptr;
3534 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::VGPR_32RegClass;
3536 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3537 RC = &AMDGPU::SGPR_32RegClass;
3538 else
3539 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3540
3541 Reg = MF.addLiveIn(Reg, RC);
3542 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3543
3544 if (Arg.Flags.isSRet()) {
3545 // The return object should be reasonably addressable.
3546
3547 // FIXME: This helps when the return is a real sret. If it is an
3548 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3549 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3550 unsigned NumBits =
3552 Val = DAG.getNode(
3553 ISD::AssertZext, DL, VT, Val,
3554 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3555 }
3556
3557 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3558 InVals.push_back(Val);
3559 }
3560
3561 // Start adding system SGPRs.
3562 if (IsEntryFunc)
3563 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3564
3565 if (DAG.getPass()) {
3566 auto &ArgUsageInfo =
3568 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3569 } else if (auto *MFAM = DAG.getMFAM()) {
3570 Module &M = *MF.getFunction().getParent();
3571 auto *ArgUsageInfo =
3573 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3574 if (ArgUsageInfo)
3575 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3576 }
3577
3578 unsigned StackArgSize = CCInfo.getStackSize();
3579 Info->setBytesInStackArgArea(StackArgSize);
3580
3581 return Chains.empty() ? Chain
3582 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3583}
3584
3585// TODO: If return values can't fit in registers, we should return as many as
 3586 // possible in registers before passing on stack.
 3587 bool SITargetLowering::CanLowerReturn(
3588 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3589 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3590 const Type *RetTy) const {
3591 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3592 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3593 // for shaders. Vector types should be explicitly handled by CC.
3594 if (AMDGPU::isEntryFunctionCC(CallConv))
3595 return true;
3596
3598 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3599 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3600 return false;
3601
3602 // We must use the stack if return would require unavailable registers.
3603 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3604 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3605 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3606 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3607 return false;
3608
3609 return true;
3610}
3611
 3612 SDValue
 3613 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 3614                               bool isVarArg,
 3615                               const SmallVectorImpl<ISD::OutputArg> &Outs,
 3616                               const SmallVectorImpl<SDValue> &OutVals,
3617 const SDLoc &DL, SelectionDAG &DAG) const {
3621
3622 if (AMDGPU::isKernel(CallConv)) {
3623 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3624 OutVals, DL, DAG);
3625 }
3626
3627 bool IsShader = AMDGPU::isShader(CallConv);
3628
3629 Info->setIfReturnsVoid(Outs.empty());
3630 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3631
 3632   // CCValAssign - represents the assignment of the return value to a location.
3634
3635 // CCState - Info about the registers and stack slots.
3636 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3637 *DAG.getContext());
3638
3639 // Analyze outgoing return values.
3640 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3641
3642 SDValue Glue;
3644 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3645
3646 SDValue ReadFirstLane =
3647 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3648 // Copy the result values into the output registers.
3649 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3650 ++I, ++RealRVLocIdx) {
3651 CCValAssign &VA = RVLocs[I];
3652 assert(VA.isRegLoc() && "Can only return in registers!");
3653 // TODO: Partially return in registers if return values don't fit.
3654 SDValue Arg = OutVals[RealRVLocIdx];
3655
3656 // Copied from other backends.
3657 switch (VA.getLocInfo()) {
3658 case CCValAssign::Full:
3659 break;
3660 case CCValAssign::BCvt:
3661 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3662 break;
3663 case CCValAssign::SExt:
3664 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3665 break;
3666 case CCValAssign::ZExt:
3667 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3668 break;
3669 case CCValAssign::AExt:
3670 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3671 break;
3672 default:
3673 llvm_unreachable("Unknown loc info!");
3674 }
3675 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3677 ReadFirstLane, Arg);
3678 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3679 Glue = Chain.getValue(1);
3680 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3681 }
3682
3683 // FIXME: Does sret work properly?
3684 if (!Info->isEntryFunction()) {
3685 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3686 const MCPhysReg *I =
3687 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3688 if (I) {
3689 for (; *I; ++I) {
3690 if (AMDGPU::SReg_64RegClass.contains(*I))
3691 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3692 else if (AMDGPU::SReg_32RegClass.contains(*I))
3693 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3694 else
3695 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3696 }
3697 }
3698 }
3699
3700 // Update chain and glue.
3701 RetOps[0] = Chain;
3702 if (Glue.getNode())
3703 RetOps.push_back(Glue);
3704
3705 unsigned Opc = AMDGPUISD::ENDPGM;
3706 if (!IsWaveEnd)
3707 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3708 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3709 : AMDGPUISD::RET_GLUE;
3710 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3711}
 3712
 3713 SDValue SITargetLowering::LowerCallResult(
3714 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3715 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3716 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3717 SDValue ThisVal) const {
3718 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3719
3720 // Assign locations to each value returned by this call.
3722 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3723 *DAG.getContext());
3724 CCInfo.AnalyzeCallResult(Ins, RetCC);
3725
3726 // Copy all of the result registers out of their specified physreg.
3727 for (CCValAssign VA : RVLocs) {
3728 SDValue Val;
3729
3730 if (VA.isRegLoc()) {
3731 Val =
3732 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3733 Chain = Val.getValue(1);
3734 InGlue = Val.getValue(2);
3735 } else if (VA.isMemLoc()) {
3736 report_fatal_error("TODO: return values in memory");
3737 } else
3738 llvm_unreachable("unknown argument location type");
3739
3740 switch (VA.getLocInfo()) {
3741 case CCValAssign::Full:
3742 break;
3743 case CCValAssign::BCvt:
3744 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3745 break;
3746 case CCValAssign::ZExt:
3747 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3748 DAG.getValueType(VA.getValVT()));
3749 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3750 break;
3751 case CCValAssign::SExt:
3752 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3753 DAG.getValueType(VA.getValVT()));
3754 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3755 break;
3756 case CCValAssign::AExt:
3757 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3758 break;
3759 default:
3760 llvm_unreachable("Unknown loc info!");
3761 }
3762
3763 InVals.push_back(Val);
3764 }
3765
3766 return Chain;
3767}
3768
3769// Add code to pass special inputs required depending on used features separate
3770// from the explicit user arguments present in the IR.
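// In practice these are the dispatch/queue/implicit-arg pointers, the dispatch
// id, the workgroup IDs, the LDS kernel id (see the table below), and the
// packed workitem IDs handled at the end of this function.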
 3771 void SITargetLowering::passSpecialInputs(
 3772     CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3773 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3774 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3775 // If we don't have a call site, this was a call inserted by
3776 // legalization. These can never use special inputs.
3777 if (!CLI.CB)
3778 return;
3779
3780 SelectionDAG &DAG = CLI.DAG;
3781 const SDLoc &DL = CLI.DL;
3782 const Function &F = DAG.getMachineFunction().getFunction();
3783
3784 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3785 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3786
 3787   const AMDGPUFunctionArgInfo *CalleeArgInfo =
 3788       &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3789 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3790 if (DAG.getPass()) {
3791 auto &ArgUsageInfo =
3793 CalleeArgInfo =
3794 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3795 } else if (auto *MFAM = DAG.getMFAM()) {
3797 auto *ArgUsageInfo =
3799 DAG.getMachineFunction())
3800 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3801 if (ArgUsageInfo)
3802 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3803 }
3804 }
3805
3806 // TODO: Unify with private memory register handling. This is complicated by
3807 // the fact that at least in kernels, the input argument is not necessarily
3808 // in the same location as the input.
3809 // clang-format off
3810 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3811 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3812 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3813 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3814 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3815 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3816 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3817 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3818 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3819 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3820 };
3821 // clang-format on
3822
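  // An input is only skipped when every attribute listed for it in the table is
  // present on the call site; empty attribute names in an entry are ignored.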
3823 for (auto [InputID, Attrs] : ImplicitAttrs) {
3824 // If the callee does not use the attribute value, skip copying the value.
3825 if (all_of(Attrs, [&](StringRef Attr) {
3826 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3827 }))
3828 continue;
3829
3830 const auto [OutgoingArg, ArgRC, ArgTy] =
3831 CalleeArgInfo->getPreloadedValue(InputID);
3832 if (!OutgoingArg)
3833 continue;
3834
3835 const auto [IncomingArg, IncomingArgRC, Ty] =
3836 CallerArgInfo.getPreloadedValue(InputID);
3837 assert(IncomingArgRC == ArgRC);
3838
3839 // All special arguments are ints for now.
3840 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3841 SDValue InputReg;
3842
3843 if (IncomingArg) {
3844 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3845 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3846 // The implicit arg ptr is special because it doesn't have a corresponding
3847 // input for kernels, and is computed from the kernarg segment pointer.
3848 InputReg = getImplicitArgPtr(DAG, DL);
3849 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
 3850       std::optional<uint32_t> Id =
 3851           AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3852 if (Id.has_value()) {
3853 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3854 } else {
3855 InputReg = DAG.getPOISON(ArgVT);
3856 }
3857 } else {
 3858       // We may have proven the input wasn't needed, although the ABI still
 3859       // requires it. We just need to allocate the register appropriately.
3860 InputReg = DAG.getPOISON(ArgVT);
3861 }
3862
3863 if (OutgoingArg->isRegister()) {
3864 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3865 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3866 report_fatal_error("failed to allocate implicit input argument");
3867 } else {
3868 unsigned SpecialArgOffset =
3869 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3870 SDValue ArgStore =
3871 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3872 MemOpChains.push_back(ArgStore);
3873 }
3874 }
3875
 3876   // Pack workitem IDs into a single register, or pass them as-is if already
3877 // packed.
3878
 3879   auto [OutgoingArg, ArgRC, Ty] =
 3880       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3881 if (!OutgoingArg)
 3882     std::tie(OutgoingArg, ArgRC, Ty) =
 3883         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3884 if (!OutgoingArg)
 3885     std::tie(OutgoingArg, ArgRC, Ty) =
 3886         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3887 if (!OutgoingArg)
3888 return;
3889
 3890   const ArgDescriptor *IncomingArgX = std::get<0>(
 3891       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
 3892   const ArgDescriptor *IncomingArgY = std::get<0>(
 3893       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
 3894   const ArgDescriptor *IncomingArgZ = std::get<0>(
 3895       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3896
3897 SDValue InputReg;
3898 SDLoc SL;
3899
3900 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3901 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3902 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3903
 3904   // If incoming ids are not packed, we need to pack them.
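  // The packed layout keeps X in bits [9:0], Y in bits [19:10] and Z in bits
  // [29:20]; e.g. X=5, Y=3, Z=2 packs to 5 | (3 << 10) | (2 << 20) = 0x200C05.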
3905 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3906 NeedWorkItemIDX) {
3907 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3908 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3909 } else {
3910 InputReg = DAG.getConstant(0, DL, MVT::i32);
3911 }
3912 }
3913
3914 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3915 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3916 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3917 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3918 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3919 InputReg = InputReg.getNode()
3920 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3921 : Y;
3922 }
3923
3924 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3925 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3926 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3927 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3928 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3929 InputReg = InputReg.getNode()
3930 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3931 : Z;
3932 }
3933
3934 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3935 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3936 // We're in a situation where the outgoing function requires the workitem
 3937       // ID, but the calling function does not have it (e.g. a graphics function
3938 // calling a C calling convention function). This is illegal, but we need
3939 // to produce something.
3940 InputReg = DAG.getPOISON(MVT::i32);
3941 } else {
 3942       // Workitem ids are already packed; any of the present incoming arguments
 3943       // will carry all the required fields.
3944 ArgDescriptor IncomingArg =
3945 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3946 : IncomingArgY ? *IncomingArgY
3947 : *IncomingArgZ,
3948 ~0u);
3949 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3950 }
3951 }
3952
3953 if (OutgoingArg->isRegister()) {
3954 if (InputReg)
3955 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3956
3957 CCInfo.AllocateReg(OutgoingArg->getRegister());
3958 } else {
3959 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3960 if (InputReg) {
3961 SDValue ArgStore =
3962 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3963 MemOpChains.push_back(ArgStore);
3964 }
3965 }
3966}
3967
 3968 bool SITargetLowering::isEligibleForTailCallOptimization(
 3969     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
 3970     const SmallVectorImpl<ISD::OutputArg> &Outs,
3971 const SmallVectorImpl<SDValue> &OutVals,
3972 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3973 if (AMDGPU::isChainCC(CalleeCC))
3974 return true;
3975
3976 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3977 return false;
3978
3979 // For a divergent call target, we need to do a waterfall loop over the
3980 // possible callees which precludes us from using a simple jump.
3981 if (Callee->isDivergent())
3982 return false;
3983
3985 const Function &CallerF = MF.getFunction();
3986 CallingConv::ID CallerCC = CallerF.getCallingConv();
3988 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3989
 3990   // Kernels aren't callable, and don't have a live-in return address, so it
 3991   // doesn't make sense to do a tail call with entry functions.
3992 if (!CallerPreserved)
3993 return false;
3994
3995 bool CCMatch = CallerCC == CalleeCC;
3996
 3997   if (MF.getTarget().Options.GuaranteedTailCallOpt) {
 3998     if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3999 return true;
4000 return false;
4001 }
4002
4003 // TODO: Can we handle var args?
4004 if (IsVarArg)
4005 return false;
4006
4007 for (const Argument &Arg : CallerF.args()) {
4008 if (Arg.hasByValAttr())
4009 return false;
4010 }
4011
4012 LLVMContext &Ctx = *DAG.getContext();
4013
4014 // Check that the call results are passed in the same way.
4015 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4016 CCAssignFnForCall(CalleeCC, IsVarArg),
4017 CCAssignFnForCall(CallerCC, IsVarArg)))
4018 return false;
4019
4020 // The callee has to preserve all registers the caller needs to preserve.
4021 if (!CCMatch) {
4022 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4023 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4024 return false;
4025 }
4026
4027 // Nothing more to check if the callee is taking no arguments.
4028 if (Outs.empty())
4029 return true;
4030
4032 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4033
4034 // FIXME: We are not allocating special input registers, so we will be
4035 // deciding based on incorrect register assignments.
4036 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4037
4038 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 4039   // If the stack arguments for this call do not fit into our own save area,
 4040   // then the call cannot be made a tail call.
4041 // TODO: Is this really necessary?
4042 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4043 return false;
4044
4045 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4046 // FIXME: What about inreg arguments that end up passed in memory?
4047 if (!CCVA.isRegLoc())
4048 continue;
4049
4050 // If we are passing an argument in an SGPR, and the value is divergent,
4051 // this call requires a waterfall loop.
4052 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4053 LLVM_DEBUG(
4054 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4055 << printReg(CCVA.getLocReg(), TRI) << '\n');
4056 return false;
4057 }
4058 }
4059
4060 const MachineRegisterInfo &MRI = MF.getRegInfo();
4061 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4062}
4063
 4064 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
 4065   if (!CI->isTailCall())
4066 return false;
4067
4068 const Function *ParentFn = CI->getFunction();
 4069   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
 4070     return false;
4071 return true;
4072}
4073
4074namespace {
4075// Chain calls have special arguments that we need to handle. These are
4076// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4077// arguments (index 0 and 1 respectively).
4078enum ChainCallArgIdx {
4079 Exec = 2,
4080 Flags,
4081 NumVGPRs,
4082 FallbackExec,
4083 FallbackCallee
4084};
4085} // anonymous namespace
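// With a flags value of 0, no arguments beyond Flags are allowed; if bit 0 of
// the flags is set (dynamic VGPR mode, wave32 only), the NumVGPRs, FallbackExec
// and FallbackCallee arguments must also be present (see LowerCall below).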
4086
4087// The wave scratch offset register is used as the global base pointer.
 4088 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 4089                                     SmallVectorImpl<SDValue> &InVals) const {
4090 CallingConv::ID CallConv = CLI.CallConv;
4091 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4092
4093 SelectionDAG &DAG = CLI.DAG;
4094
4095 const SDLoc &DL = CLI.DL;
4096 SDValue Chain = CLI.Chain;
4097 SDValue Callee = CLI.Callee;
4098
4099 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4100 bool UsesDynamicVGPRs = false;
4101 if (IsChainCallConv) {
4102 // The last arguments should be the value that we need to put in EXEC,
4103 // followed by the flags and any other arguments with special meanings.
4104 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4105 // we don't treat them like the "real" arguments.
4106 auto RequestedExecIt =
4107 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4108 return Arg.OrigArgIndex == 2;
4109 });
4110 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4111
4112 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4113 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4114 CLI.OutVals.end());
4115 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4116
4117 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4118 "Haven't popped all the special args");
4119
4120 TargetLowering::ArgListEntry RequestedExecArg =
4121 CLI.Args[ChainCallArgIdx::Exec];
4122 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4123 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4124
4125 // Convert constants into TargetConstants, so they become immediate operands
4126 // instead of being selected into S_MOV.
4127 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4128 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4129 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4130 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4131 } else
4132 ChainCallSpecialArgs.push_back(Arg.Node);
4133 };
4134
4135 PushNodeOrTargetConstant(RequestedExecArg);
4136
4137 // Process any other special arguments depending on the value of the flags.
4138 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4139
4140 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4141 if (FlagsValue.isZero()) {
4142 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4143 return lowerUnhandledCall(CLI, InVals,
4144 "no additional args allowed if flags == 0");
4145 } else if (FlagsValue.isOneBitSet(0)) {
4146 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4147 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4148 }
4149
4150 if (!Subtarget->isWave32()) {
4151 return lowerUnhandledCall(
4152 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4153 }
4154
4155 UsesDynamicVGPRs = true;
4156 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4157 CLI.Args.end(), PushNodeOrTargetConstant);
4158 }
4159 }
4160
 4161   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
 4162   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
 4163   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4164 bool &IsTailCall = CLI.IsTailCall;
4165 bool IsVarArg = CLI.IsVarArg;
4166 bool IsSibCall = false;
4168
4169 if (Callee.isUndef() || isNullConstant(Callee)) {
4170 if (!CLI.IsTailCall) {
4171 for (ISD::InputArg &Arg : CLI.Ins)
4172 InVals.push_back(DAG.getPOISON(Arg.VT));
4173 }
4174
4175 return Chain;
4176 }
4177
4178 if (IsVarArg) {
4179 return lowerUnhandledCall(CLI, InVals,
4180 "unsupported call to variadic function ");
4181 }
4182
4183 if (!CLI.CB)
4184 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4185
4186 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4187 return lowerUnhandledCall(CLI, InVals,
4188 "unsupported required tail call to function ");
4189 }
4190
4191 if (IsTailCall) {
4192 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4193 Outs, OutVals, Ins, DAG);
4194 if (!IsTailCall &&
4195 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4196 report_fatal_error("failed to perform tail call elimination on a call "
4197 "site marked musttail or on llvm.amdgcn.cs.chain");
4198 }
4199
4200 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4201
4202 // A sibling call is one where we're under the usual C ABI and not planning
4203 // to change that but can still do a tail call:
4204 if (!TailCallOpt && IsTailCall)
4205 IsSibCall = true;
4206
4207 if (IsTailCall)
4208 ++NumTailCalls;
4209 }
4210
4213 SmallVector<SDValue, 8> MemOpChains;
4214
4215 // Analyze operands of the call, assigning locations to each operand.
4217 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4218 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4219
4220 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4222 // With a fixed ABI, allocate fixed registers before user arguments.
4223 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4224 }
4225
4226 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4227
4228 // Get a count of how many bytes are to be pushed on the stack.
4229 unsigned NumBytes = CCInfo.getStackSize();
4230
4231 if (IsSibCall) {
4232 // Since we're not changing the ABI to make this a tail call, the memory
4233 // operands are already available in the caller's incoming argument space.
4234 NumBytes = 0;
4235 }
4236
4237 // FPDiff is the byte offset of the call's argument area from the callee's.
4238 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4239 // by this amount for a tail call. In a sibling call it must be 0 because the
4240 // caller will deallocate the entire stack and the callee still expects its
4241 // arguments to begin at SP+0. Completely unused for non-tail calls.
4242 int32_t FPDiff = 0;
4243 MachineFrameInfo &MFI = MF.getFrameInfo();
4244 auto *TRI = Subtarget->getRegisterInfo();
4245
4246 // Adjust the stack pointer for the new arguments...
4247 // These operations are automatically eliminated by the prolog/epilog pass
4248 if (!IsSibCall)
4249 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4250
4251 if (!IsSibCall || IsChainCallConv) {
4252 if (!Subtarget->enableFlatScratch()) {
4253 SmallVector<SDValue, 4> CopyFromChains;
4254
4255 // In the HSA case, this should be an identity copy.
4256 SDValue ScratchRSrcReg =
4257 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4258 RegsToPass.emplace_back(IsChainCallConv
4259 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4260 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4261 ScratchRSrcReg);
4262 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4263 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4264 }
4265 }
4266
4267 const unsigned NumSpecialInputs = RegsToPass.size();
4268
4269 MVT PtrVT = MVT::i32;
4270
4271 // Walk the register/memloc assignments, inserting copies/loads.
4272 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4273 CCValAssign &VA = ArgLocs[i];
4274 SDValue Arg = OutVals[i];
4275
4276 // Promote the value if needed.
4277 switch (VA.getLocInfo()) {
4278 case CCValAssign::Full:
4279 break;
4280 case CCValAssign::BCvt:
4281 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4282 break;
4283 case CCValAssign::ZExt:
4284 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4285 break;
4286 case CCValAssign::SExt:
4287 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4288 break;
4289 case CCValAssign::AExt:
4290 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4291 break;
4292 case CCValAssign::FPExt:
4293 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4294 break;
4295 default:
4296 llvm_unreachable("Unknown loc info!");
4297 }
4298
4299 if (VA.isRegLoc()) {
4300 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4301 } else {
4302 assert(VA.isMemLoc());
4303
4304 SDValue DstAddr;
4305 MachinePointerInfo DstInfo;
4306
4307 unsigned LocMemOffset = VA.getLocMemOffset();
4308 int32_t Offset = LocMemOffset;
4309
4310 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4311 MaybeAlign Alignment;
4312
4313 if (IsTailCall) {
4314 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4315 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4316 : VA.getValVT().getStoreSize();
4317
4318 // FIXME: We can have better than the minimum byval required alignment.
4319 Alignment =
4320 Flags.isByVal()
4321 ? Flags.getNonZeroByValAlign()
4322 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4323
4324 Offset = Offset + FPDiff;
4325 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4326
4327 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4328 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4329
4330 // Make sure any stack arguments overlapping with where we're storing
4331 // are loaded before this eventual operation. Otherwise they'll be
4332 // clobbered.
4333
4334 // FIXME: Why is this really necessary? This seems to just result in a
4335 // lot of code to copy the stack and write them back to the same
4336 // locations, which are supposed to be immutable?
4337 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4338 } else {
4339 // Stores to the argument stack area are relative to the stack pointer.
4340 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4341 MVT::i32);
4342 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4343 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4344 Alignment =
4345 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4346 }
4347
4348 if (Outs[i].Flags.isByVal()) {
4349 SDValue SizeNode =
4350 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4351 SDValue Cpy =
4352 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4353 Outs[i].Flags.getNonZeroByValAlign(),
4354 /*isVol = */ false, /*AlwaysInline = */ true,
4355 /*CI=*/nullptr, std::nullopt, DstInfo,
4357
4358 MemOpChains.push_back(Cpy);
4359 } else {
4360 SDValue Store =
4361 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4362 MemOpChains.push_back(Store);
4363 }
4364 }
4365 }
4366
4367 if (!MemOpChains.empty())
4368 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4369
4370 SDValue ReadFirstLaneID =
4371 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4372
4373 SDValue TokenGlue;
4374 if (CLI.ConvergenceControlToken) {
4375 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4377 }
4378
4379 // Build a sequence of copy-to-reg nodes chained together with token chain
4380 // and flag operands which copy the outgoing args into the appropriate regs.
4381 SDValue InGlue;
4382
4383 unsigned ArgIdx = 0;
4384 for (auto [Reg, Val] : RegsToPass) {
4385 if (ArgIdx++ >= NumSpecialInputs &&
4386 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4387 // For chain calls, the inreg arguments are required to be
 4388       // uniform. Speculatively insert a readfirstlane in case we cannot prove
4389 // they are uniform.
4390 //
 4391       // For other calls, if an inreg argument is known to be uniform,
4392 // speculatively insert a readfirstlane in case it is in a VGPR.
4393 //
 4394       // FIXME: We need to execute this in a waterfall loop if it is a divergent
 4395       // value; for now we let that continue to produce invalid code.
4396
4397 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4398 if (TokenGlue)
4399 ReadfirstlaneArgs.push_back(TokenGlue);
4401 ReadfirstlaneArgs);
4402 }
4403
4404 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4405 InGlue = Chain.getValue(1);
4406 }
4407
 4408   // We don't usually want to end the call sequence here because we would tidy
 4409   // the frame up *after* the call; however, in the ABI-changing tail-call case
4410 // we've carefully laid out the parameters so that when sp is reset they'll be
4411 // in the correct location.
4412 if (IsTailCall && !IsSibCall) {
4413 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4414 InGlue = Chain.getValue(1);
4415 }
4416
4417 std::vector<SDValue> Ops({Chain});
4418
4419 // Add a redundant copy of the callee global which will not be legalized, as
4420 // we need direct access to the callee later.
4422 const GlobalValue *GV = GSD->getGlobal();
4423 Ops.push_back(Callee);
4424 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4425 } else {
4426 if (IsTailCall) {
4427 // isEligibleForTailCallOptimization considered whether the call target is
4428 // divergent, but we may still end up with a uniform value in a VGPR.
4429 // Insert a readfirstlane just in case.
4430 SDValue ReadFirstLaneID =
4431 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4432
4433 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4434 if (TokenGlue)
4435 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4436 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4437 ReadfirstlaneArgs);
4438 }
4439
4440 Ops.push_back(Callee);
4441 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4442 }
4443
4444 if (IsTailCall) {
4445 // Each tail call may have to adjust the stack by a different amount, so
4446 // this information must travel along with the operation for eventual
4447 // consumption by emitEpilogue.
4448 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4449 }
4450
4451 if (IsChainCallConv)
4452 llvm::append_range(Ops, ChainCallSpecialArgs);
4453
4454 // Add argument registers to the end of the list so that they are known live
4455 // into the call.
4456 for (auto &[Reg, Val] : RegsToPass)
4457 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4458
4459 // Add a register mask operand representing the call-preserved registers.
4460 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4461 assert(Mask && "Missing call preserved mask for calling convention");
4462 Ops.push_back(DAG.getRegisterMask(Mask));
4463
4464 if (SDValue Token = CLI.ConvergenceControlToken) {
4466 GlueOps.push_back(Token);
4467 if (InGlue)
4468 GlueOps.push_back(InGlue);
4469
4470 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4471 MVT::Glue, GlueOps),
4472 0);
4473 }
4474
4475 if (InGlue)
4476 Ops.push_back(InGlue);
4477
 4478   // If we're doing a tail call, use a TC_RETURN here rather than an
4479 // actual call instruction.
4480 if (IsTailCall) {
4481 MFI.setHasTailCall();
4482 unsigned OPC = AMDGPUISD::TC_RETURN;
4483 switch (CallConv) {
 4484     case CallingConv::AMDGPU_Gfx:
 4485       OPC = AMDGPUISD::TC_RETURN_GFX;
4486 break;
 4487     case CallingConv::AMDGPU_CS_Chain:
 4488     case CallingConv::AMDGPU_CS_ChainPreserve:
 4489       OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4490 : AMDGPUISD::TC_RETURN_CHAIN;
4491 break;
4492 }
4493
4494 // If the caller is a whole wave function, we need to use a special opcode
4495 // so we can patch up EXEC.
4496 if (Info->isWholeWaveFunction())
4497 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4498
4499 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4500 }
4501
4502 // Returns a chain and a flag for retval copy to use.
4503 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4504 Chain = Call.getValue(0);
4505 InGlue = Call.getValue(1);
4506
4507 uint64_t CalleePopBytes = NumBytes;
4508 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4509 if (!Ins.empty())
4510 InGlue = Chain.getValue(1);
4511
4512 // Handle result values, copying them out of physregs into vregs that we
4513 // return.
4514 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4515 InVals, /*IsThisReturn=*/false, SDValue());
4516}
4517
4518// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4519// except for:
 4520 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
 4521 // 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
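// For example, on a wave64 target a per-lane alloca of 16 bytes advances the
// stack pointer by 16 << 6 = 1024 bytes; a divergent size is first reduced to
// the wave-wide maximum with wave_reduce_umax before being scaled.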
4523 SelectionDAG &DAG) const {
4524 const MachineFunction &MF = DAG.getMachineFunction();
4526
4527 SDLoc dl(Op);
4528 EVT VT = Op.getValueType();
4529 SDValue Chain = Op.getOperand(0);
4530 Register SPReg = Info->getStackPtrOffsetReg();
4531
4532 // Chain the dynamic stack allocation so that it doesn't modify the stack
4533 // pointer when other instructions are using the stack.
4534 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4535
4536 SDValue Size = Op.getOperand(1);
4537 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4538 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4539
4540 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4542 "Stack grows upwards for AMDGPU");
4543
4544 Chain = BaseAddr.getValue(1);
4545 Align StackAlign = TFL->getStackAlign();
4546 if (Alignment > StackAlign) {
4547 uint64_t ScaledAlignment = Alignment.value()
4548 << Subtarget->getWavefrontSizeLog2();
4549 uint64_t StackAlignMask = ScaledAlignment - 1;
4550 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4551 DAG.getConstant(StackAlignMask, dl, VT));
4552 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4553 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4554 }
4555
4556 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4557 SDValue NewSP;
 4559     // For a constant-sized alloca, scale the alloca size by the wave size.
4560 SDValue ScaledSize = DAG.getNode(
4561 ISD::SHL, dl, VT, Size,
4562 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4563 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4564 } else {
 4565     // For a dynamic-sized alloca, perform a wave-wide reduction to get the max
 4566     // of the (divergent) alloca size and then scale it by the wave size.
4567 SDValue WaveReduction =
4568 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4569 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4570 Size, DAG.getConstant(0, dl, MVT::i32));
4571 SDValue ScaledSize = DAG.getNode(
4572 ISD::SHL, dl, VT, Size,
4573 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4574 NewSP =
4575 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4576 SDValue ReadFirstLaneID =
4577 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4578 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4579 NewSP);
4580 }
4581
4582 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4583 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4584
4585 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4586}
4587
4589 if (Op.getValueType() != MVT::i32)
4590 return Op; // Defer to cannot select error.
4591
4593 SDLoc SL(Op);
4594
4595 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4596
4597 // Convert from wave uniform to swizzled vector address. This should protect
4598 // from any edge cases where the stacksave result isn't directly used with
4599 // stackrestore.
4600 SDValue VectorAddress =
4601 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4602 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4603}
4604
4606 SelectionDAG &DAG) const {
4607 SDLoc SL(Op);
4608 assert(Op.getValueType() == MVT::i32);
4609
4610 uint32_t BothRoundHwReg =
4612 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4613
4614 SDValue IntrinID =
4615 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4616 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4617 Op.getOperand(0), IntrinID, GetRoundBothImm);
4618
4619 // There are two rounding modes, one for f32 and one for f64/f16. We only
4620 // report in the standard value range if both are the same.
4621 //
4622 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4623 // ties away from zero is not supported, and the other values are rotated by
4624 // 1.
4625 //
4626 // If the two rounding modes are not the same, report a target defined value.
4627
4628 // Mode register rounding mode fields:
4629 //
4630 // [1:0] Single-precision round mode.
4631 // [3:2] Double/Half-precision round mode.
4632 //
4633 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4634 //
4635 // Hardware Spec
4636 // Toward-0 3 0
4637 // Nearest Even 0 1
4638 // +Inf 1 2
4639 // -Inf 2 3
4640 // NearestAway0 N/A 4
4641 //
4642 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4643 // table we can index by the raw hardware mode.
4644 //
 4645   // (trunc (FltRoundConversionTable >> (MODE.fp_round * 4))) & 0xf
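  // For example, when both fields read back as 0 (round to nearest even in
  // hardware), the low nibble of the table is selected, which should yield the
  // standard FLT_ROUNDS value 1 (to nearest).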
4646
4647 SDValue BitTable =
4649
4650 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4651 SDValue RoundModeTimesNumBits =
4652 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4653
4654 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4655 // knew only one mode was demanded.
4656 SDValue TableValue =
4657 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4658 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4659
4660 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4661 SDValue TableEntry =
4662 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4663
 4664   // There's a gap between the 4-bit encoded table values and the actual enum
 4665   // values, so offset if it's an extended value.
4666 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4667 SDValue IsStandardValue =
4668 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4669 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4670 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4671 TableEntry, EnumOffset);
4672
4673 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4674}
4675
4677 SelectionDAG &DAG) const {
4678 SDLoc SL(Op);
4679
4680 SDValue NewMode = Op.getOperand(1);
4681 assert(NewMode.getValueType() == MVT::i32);
4682
4683 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4684 // hardware MODE.fp_round values.
4685 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4686 uint32_t ClampedVal = std::min(
4687 static_cast<uint32_t>(ConstMode->getZExtValue()),
4689 NewMode = DAG.getConstant(
4690 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4691 } else {
4692 // If we know the input can only be one of the supported standard modes in
4693 // the range 0-3, we can use a simplified mapping to hardware values.
4694 KnownBits KB = DAG.computeKnownBits(NewMode);
4695 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4696 // The supported standard values are 0-3. The extended values start at 8. We
4697 // need to offset by 4 if the value is in the extended range.
4698
4699 if (UseReducedTable) {
4700 // Truncate to the low 32-bits.
4701 SDValue BitTable = DAG.getConstant(
4702 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4703
4704 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4705 SDValue RoundModeTimesNumBits =
4706 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4707
4708 NewMode =
4709 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4710
4711 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4712 // the table extracted bits into inline immediates.
4713 } else {
4714 // table_index = umin(value, value - 4)
4715 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
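      // E.g. a standard value of 2 (+infinity) keeps index 2, since 2 - 4 wraps
      // to a huge unsigned value, while an extended value of 8 maps to index 4.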
4716 SDValue BitTable =
4718
4719 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4720 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4721 SDValue IndexVal =
4722 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4723
4724 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4725 SDValue RoundModeTimesNumBits =
4726 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4727
4728 SDValue TableValue =
4729 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4730 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4731
4732 // No need to mask out the high bits since the setreg will ignore them
4733 // anyway.
4734 NewMode = TruncTable;
4735 }
4736
 4737     // Insert a readfirstlane in case the value is in a VGPR. We could do this
4738 // earlier and keep more operations scalar, but that interferes with
4739 // combining the source.
4740 SDValue ReadFirstLaneID =
4741 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4742 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4743 ReadFirstLaneID, NewMode);
4744 }
4745
4746 // N.B. The setreg will be later folded into s_round_mode on supported
4747 // targets.
4748 SDValue IntrinID =
4749 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4750 uint32_t BothRoundHwReg =
4752 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4753
4754 SDValue SetReg =
4755 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4756 IntrinID, RoundBothImm, NewMode);
4757
4758 return SetReg;
4759}
4760
4762 if (Op->isDivergent() &&
4763 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4764 // Cannot do I$ prefetch with divergent pointer.
4765 return SDValue();
4766
4767 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4771 break;
4773 if (Subtarget->hasSafeSmemPrefetch())
4774 break;
4775 [[fallthrough]];
4776 default:
4777 return SDValue();
4778 }
4779
4780 // I$ prefetch
4781 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4782 return SDValue();
4783
4784 return Op;
4785}
4786
4787// Work around DAG legality rules only based on the result type.
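// A bf16 source is expanded here by bitcasting it to the same-sized integer
// type and converting through ISD::BF16_TO_FP; the strict variant is not
// implemented yet.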
4789 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4790 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4791 EVT SrcVT = Src.getValueType();
4792
4793 if (SrcVT.getScalarType() != MVT::bf16)
4794 return Op;
4795
4796 SDLoc SL(Op);
4797 SDValue BitCast =
4798 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4799
4800 EVT DstVT = Op.getValueType();
4801 if (IsStrict)
4802 llvm_unreachable("Need STRICT_BF16_TO_FP");
4803
4804 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4805}
4806
4808 SDLoc SL(Op);
4809 if (Op.getValueType() != MVT::i64)
4810 return Op;
4811
4812 uint32_t ModeHwReg =
4814 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4815 uint32_t TrapHwReg =
4817 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4818
4819 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4820 SDValue IntrinID =
4821 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4822 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4823 Op.getOperand(0), IntrinID, ModeHwRegImm);
4824 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4825 Op.getOperand(0), IntrinID, TrapHwRegImm);
4826 SDValue TokenReg =
4827 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4828 GetTrapReg.getValue(1));
4829
4830 SDValue CvtPtr =
4831 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4832 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
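  // The result is therefore an i64 with the MODE register value in the low half
  // and the trap-status register value in the high half.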
4833
4834 return DAG.getMergeValues({Result, TokenReg}, SL);
4835}
4836
4838 SDLoc SL(Op);
4839 if (Op.getOperand(1).getValueType() != MVT::i64)
4840 return Op;
4841
4842 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4843 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4844 DAG.getConstant(0, SL, MVT::i32));
4845 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4846 DAG.getConstant(1, SL, MVT::i32));
4847
4848 SDValue ReadFirstLaneID =
4849 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4850 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4851 ReadFirstLaneID, NewModeReg);
4852 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4853 ReadFirstLaneID, NewTrapReg);
4854
4855 unsigned ModeHwReg =
4857 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4858 unsigned TrapHwReg =
4860 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4861
4862 SDValue IntrinID =
4863 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4864 SDValue SetModeReg =
4865 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4866 IntrinID, ModeHwRegImm, NewModeReg);
4867 SDValue SetTrapReg =
4868 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4869 IntrinID, TrapHwRegImm, NewTrapReg);
4870 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4871}
4872
 4873 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
 4874                                              const MachineFunction &MF) const {
4875 const Function &Fn = MF.getFunction();
4876
4878 .Case("m0", AMDGPU::M0)
4879 .Case("exec", AMDGPU::EXEC)
4880 .Case("exec_lo", AMDGPU::EXEC_LO)
4881 .Case("exec_hi", AMDGPU::EXEC_HI)
4882 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4883 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4884 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4885 .Default(Register());
4886 if (!Reg)
4887 return Reg;
4888
4889 if (!Subtarget->hasFlatScrRegister() &&
4890 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4891 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4892 "\" for subtarget."));
4893 }
4894
4895 switch (Reg) {
4896 case AMDGPU::M0:
4897 case AMDGPU::EXEC_LO:
4898 case AMDGPU::EXEC_HI:
4899 case AMDGPU::FLAT_SCR_LO:
4900 case AMDGPU::FLAT_SCR_HI:
4901 if (VT.getSizeInBits() == 32)
4902 return Reg;
4903 break;
4904 case AMDGPU::EXEC:
4905 case AMDGPU::FLAT_SCR:
4906 if (VT.getSizeInBits() == 64)
4907 return Reg;
4908 break;
4909 default:
4910 llvm_unreachable("missing register type checking");
4911 }
4912
4914 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4915}
4916
4917// If kill is not the last instruction, split the block so kill is always a
4918// proper terminator.
4921 MachineBasicBlock *BB) const {
4922 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4924 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4925 return SplitBB;
4926}
4927
 4928 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4929// \p MI will be the only instruction in the loop body block. Otherwise, it will
4930// be the first instruction in the remainder block.
4931//
4932/// \returns { LoopBody, Remainder }
4933static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4935 MachineFunction *MF = MBB.getParent();
4937
4938 // To insert the loop we need to split the block. Move everything after this
4939 // point to a new block, and insert a new empty block between the two.
4941 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4943 ++MBBI;
4944
4945 MF->insert(MBBI, LoopBB);
4946 MF->insert(MBBI, RemainderBB);
4947
4948 LoopBB->addSuccessor(LoopBB);
4949 LoopBB->addSuccessor(RemainderBB);
4950
4951 // Move the rest of the block into a new block.
4952 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4953
4954 if (InstInLoop) {
4955 auto Next = std::next(I);
4956
4957 // Move instruction to loop body.
4958 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4959
4960 // Move the rest of the block.
4961 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4962 } else {
4963 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4964 }
4965
4966 MBB.addSuccessor(LoopBB);
4967
4968 return std::pair(LoopBB, RemainderBB);
4969}
4970
4971/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4973 MachineBasicBlock *MBB = MI.getParent();
4975 auto I = MI.getIterator();
4976 auto E = std::next(I);
4977
4978 // clang-format off
4979 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4980 .addImm(0);
4981 // clang-format on
4982
4983 MIBundleBuilder Bundler(*MBB, I, E);
4984 finalizeBundle(*MBB, Bundler.begin());
4985}
4986
4989 MachineBasicBlock *BB) const {
4990 const DebugLoc &DL = MI.getDebugLoc();
4991
4993
4995
4996 // Apparently kill flags are only valid if the def is in the same block?
4997 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4998 Src->setIsKill(false);
4999
5000 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5001
5002 MachineBasicBlock::iterator I = LoopBB->end();
5003
5004 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5006
5007 // Clear TRAP_STS.MEM_VIOL
5008 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5009 .addImm(0)
5010 .addImm(EncodedReg);
5011
5013
5014 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5015
5016 // Load and check TRAP_STS.MEM_VIOL
5017 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5018 .addImm(EncodedReg);
5019
5020 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5021 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5022 .addReg(Reg, RegState::Kill)
5023 .addImm(0);
5024 // clang-format off
5025 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5026 .addMBB(LoopBB);
5027 // clang-format on
5028
5029 return RemainderBB;
5030}
5031
5032// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5033// wavefront. If the value is uniform and just happens to be in a VGPR, this
5034// will only do one iteration. In the worst case, this will loop 64 times.
5035//
5036// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
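// Each iteration readfirstlanes one index value from the VGPR, compares it
// against every lane's index, restricts EXEC to the matching lanes and sets up
// M0 (or the SGPR index register) for them, then clears those lanes from EXEC
// and loops back while any remain.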
5039 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5040 const DebugLoc &DL, const MachineOperand &Idx,
5041 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5042 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5043 Register &SGPRIdxReg) {
5044
5045 MachineFunction *MF = OrigBB.getParent();
5046 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5047 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5050
5051 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5052 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5053 Register NewExec = MRI.createVirtualRegister(BoolRC);
5054 Register CurrentIdxReg =
5055 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5056 Register CondReg = MRI.createVirtualRegister(BoolRC);
5057
5058 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5059 .addReg(InitReg)
5060 .addMBB(&OrigBB)
5061 .addReg(ResultReg)
5062 .addMBB(&LoopBB);
5063
5064 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5065 .addReg(InitSaveExecReg)
5066 .addMBB(&OrigBB)
5067 .addReg(NewExec)
5068 .addMBB(&LoopBB);
5069
5070 // Read the next variant <- also loop target.
5071 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5072 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5073
5074 // Compare the just read M0 value to all possible Idx values.
5075 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5076 .addReg(CurrentIdxReg)
5077 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5078
5079 // Update EXEC, save the original EXEC value to VCC.
5080 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5081 .addReg(CondReg, RegState::Kill);
5082
5083 MRI.setSimpleHint(NewExec, CondReg);
5084
5085 if (UseGPRIdxMode) {
5086 if (Offset == 0) {
5087 SGPRIdxReg = CurrentIdxReg;
5088 } else {
5089 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5090 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5091 .addReg(CurrentIdxReg, RegState::Kill)
5092 .addImm(Offset);
5093 }
5094 } else {
5095 // Move index from VCC into M0
5096 if (Offset == 0) {
5097 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5098 .addReg(CurrentIdxReg, RegState::Kill);
5099 } else {
5100 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5101 .addReg(CurrentIdxReg, RegState::Kill)
5102 .addImm(Offset);
5103 }
5104 }
5105
5106 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5107 MachineInstr *InsertPt =
5108 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5109 .addReg(LMC.ExecReg)
5110 .addReg(NewExec);
5111
5112 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5113 // s_cbranch_scc0?
5114
5115 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5116 // clang-format off
5117 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5118 .addMBB(&LoopBB);
5119 // clang-format on
5120
5121 return InsertPt->getIterator();
5122}
5123
 5124 // This has slightly sub-optimal regalloc when the source vector is killed by
 5125 // the read. The register allocator does not understand that the kill is
 5126 // per-workitem, so the source is kept alive for the whole loop and we end up
 5127 // not re-using a subregister from it, using 1 more VGPR than necessary. This
 5128 // was saved when this was expanded after register allocation.
5131 unsigned InitResultReg, unsigned PhiReg, int Offset,
5132 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5133 MachineFunction *MF = MBB.getParent();
5134 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5135 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5137 const DebugLoc &DL = MI.getDebugLoc();
5139
5140 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5141 Register DstReg = MI.getOperand(0).getReg();
5142 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5143 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5145
5146 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5147
5148 // Save the EXEC mask
5149 // clang-format off
5150 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5151 .addReg(LMC.ExecReg);
5152 // clang-format on
5153
5154 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5155
5156 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5157
5158 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5159 InitResultReg, DstReg, PhiReg, TmpExec,
5160 Offset, UseGPRIdxMode, SGPRIdxReg);
5161
5162 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5164 ++MBBI;
5165 MF->insert(MBBI, LandingPad);
5166 LoopBB->removeSuccessor(RemainderBB);
5167 LandingPad->addSuccessor(RemainderBB);
5168 LoopBB->addSuccessor(LandingPad);
5169 MachineBasicBlock::iterator First = LandingPad->begin();
5170 // clang-format off
5171 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5172 .addReg(SaveExec);
5173 // clang-format on
5174
5175 return InsPt;
5176}
5177
5178// Returns subreg index, offset
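// E.g. with a 128-bit (4 x 32-bit) super-register, Offset 2 yields (sub2, 0),
// while an out-of-bounds Offset such as 5 is returned unchanged as (sub0, 5).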
5179static std::pair<unsigned, int>
5181 const TargetRegisterClass *SuperRC, unsigned VecReg,
5182 int Offset) {
5183 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5184
5185 // Skip out of bounds offsets, or else we would end up using an undefined
5186 // register.
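// For example (illustrative only), Offset 2 into a 128-bit (4 x 32-bit)
// super-register yields (sub2, 0), while an out-of-range Offset is returned
// unchanged together with sub0.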
5187 if (Offset >= NumElts || Offset < 0)
5188 return std::pair(AMDGPU::sub0, Offset);
5189
5190 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5191}
5192
5195 int Offset) {
5196 MachineBasicBlock *MBB = MI.getParent();
5197 const DebugLoc &DL = MI.getDebugLoc();
5199
5200 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5201
5202 assert(Idx->getReg() != AMDGPU::NoRegister);
5203
5204 if (Offset == 0) {
5205 // clang-format off
5206 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5207 .add(*Idx);
5208 // clang-format on
5209 } else {
5210 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5211 .add(*Idx)
5212 .addImm(Offset);
5213 }
5214}
5215
5218 int Offset) {
5219 MachineBasicBlock *MBB = MI.getParent();
5220 const DebugLoc &DL = MI.getDebugLoc();
5222
5223 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5224
5225 if (Offset == 0)
5226 return Idx->getReg();
5227
5228 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5229 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5230 .add(*Idx)
5231 .addImm(Offset);
5232 return Tmp;
5233}
5234
5235 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5236 MachineBasicBlock &MBB,
5237 const GCNSubtarget &ST) {
5238 const SIInstrInfo *TII = ST.getInstrInfo();
5239 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5240 MachineFunction *MF = MBB.getParent();
5241 MachineRegisterInfo &MRI = MF->getRegInfo();
5242
5243 Register Dst = MI.getOperand(0).getReg();
5244 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5245 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5246 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5247
5248 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5249 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5250
5251 unsigned SubReg;
5252 std::tie(SubReg, Offset) =
5253 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5254
5255 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5256
5257 // Check for a SGPR index.
5258 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5260 const DebugLoc &DL = MI.getDebugLoc();
5261
5262 if (UseGPRIdxMode) {
5263 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5264 // to avoid interfering with other uses, so probably requires a new
5265 // optimization pass.
5267
5268 const MCInstrDesc &GPRIDXDesc =
5269 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5270 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5271 .addReg(SrcReg)
5272 .addReg(Idx)
5273 .addImm(SubReg);
5274 } else {
5276
5277 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5278 .addReg(SrcReg, 0, SubReg)
5279 .addReg(SrcReg, RegState::Implicit);
5280 }
5281
5282 MI.eraseFromParent();
5283
5284 return &MBB;
5285 }
5286
5287 // Control flow needs to be inserted if indexing with a VGPR.
5288 const DebugLoc &DL = MI.getDebugLoc();
5290
5291 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5292 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5293
5294 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5295
5296 Register SGPRIdxReg;
5297 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5298 UseGPRIdxMode, SGPRIdxReg);
5299
5300 MachineBasicBlock *LoopBB = InsPt->getParent();
5301
5302 if (UseGPRIdxMode) {
5303 const MCInstrDesc &GPRIDXDesc =
5304 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5305
5306 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5307 .addReg(SrcReg)
5308 .addReg(SGPRIdxReg)
5309 .addImm(SubReg);
5310 } else {
5311 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5312 .addReg(SrcReg, 0, SubReg)
5313 .addReg(SrcReg, RegState::Implicit);
5314 }
5315
5316 MI.eraseFromParent();
5317
5318 return LoopBB;
5319}
5320
5321 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5322 MachineBasicBlock &MBB,
5323 const GCNSubtarget &ST) {
5324 const SIInstrInfo *TII = ST.getInstrInfo();
5325 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5326 MachineFunction *MF = MBB.getParent();
5327 MachineRegisterInfo &MRI = MF->getRegInfo();
5328
5329 Register Dst = MI.getOperand(0).getReg();
5330 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5331 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5332 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5333 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5334 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5335 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5336
5337 // This can be an immediate, but will be folded later.
5338 assert(Val->getReg());
5339
5340 unsigned SubReg;
5341 std::tie(SubReg, Offset) =
5342 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5343 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5344
5345 if (Idx->getReg() == AMDGPU::NoRegister) {
5347 const DebugLoc &DL = MI.getDebugLoc();
5348
5349 assert(Offset == 0);
5350
5351 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5352 .add(*SrcVec)
5353 .add(*Val)
5354 .addImm(SubReg);
5355
5356 MI.eraseFromParent();
5357 return &MBB;
5358 }
5359
5360 // Check for a SGPR index.
5361 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5363 const DebugLoc &DL = MI.getDebugLoc();
5364
5365 if (UseGPRIdxMode) {
5367
5368 const MCInstrDesc &GPRIDXDesc =
5369 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5370 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5371 .addReg(SrcVec->getReg())
5372 .add(*Val)
5373 .addReg(Idx)
5374 .addImm(SubReg);
5375 } else {
5377
5378 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5379 TRI.getRegSizeInBits(*VecRC), 32, false);
5380 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5381 .addReg(SrcVec->getReg())
5382 .add(*Val)
5383 .addImm(SubReg);
5384 }
5385 MI.eraseFromParent();
5386 return &MBB;
5387 }
5388
5389 // Control flow needs to be inserted if indexing with a VGPR.
5390 if (Val->isReg())
5391 MRI.clearKillFlags(Val->getReg());
5392
5393 const DebugLoc &DL = MI.getDebugLoc();
5394
5395 Register PhiReg = MRI.createVirtualRegister(VecRC);
5396
5397 Register SGPRIdxReg;
5398 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5399 UseGPRIdxMode, SGPRIdxReg);
5400 MachineBasicBlock *LoopBB = InsPt->getParent();
5401
5402 if (UseGPRIdxMode) {
5403 const MCInstrDesc &GPRIDXDesc =
5404 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5405
5406 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5407 .addReg(PhiReg)
5408 .add(*Val)
5409 .addReg(SGPRIdxReg)
5410 .addImm(SubReg);
5411 } else {
5412 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5413 TRI.getRegSizeInBits(*VecRC), 32, false);
5414 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5415 .addReg(PhiReg)
5416 .add(*Val)
5417 .addImm(SubReg);
5418 }
5419
5420 MI.eraseFromParent();
5421 return LoopBB;
5422}
5423
5424 static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5425 MachineBasicBlock *BB) {
5426 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5427 // For GFX12, we emit s_add_u64 and s_sub_u64.
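// For example, without s_add_u64 a 64-bit add is expanded roughly as:
//   dst.lo = s_add_u32  src0.lo, src1.lo   // sets SCC on carry-out
//   dst.hi = s_addc_u32 src0.hi, src1.hi   // consumes SCC as carry-in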
5428 MachineFunction *MF = BB->getParent();
5429 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5430 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5432 const DebugLoc &DL = MI.getDebugLoc();
5433 MachineOperand &Dest = MI.getOperand(0);
5434 MachineOperand &Src0 = MI.getOperand(1);
5435 MachineOperand &Src1 = MI.getOperand(2);
5436 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5437 if (ST.hasScalarAddSub64()) {
5438 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5439 // clang-format off
5440 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5441 .add(Src0)
5442 .add(Src1);
5443 // clang-format on
5444 } else {
5445 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5446 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5447
5448 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5449 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5450
5451 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5452 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5453 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5454 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5455
5456 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5457 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5458 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5459 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5460
5461 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5462 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5463 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5464 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5465 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5466 .addReg(DestSub0)
5467 .addImm(AMDGPU::sub0)
5468 .addReg(DestSub1)
5469 .addImm(AMDGPU::sub1);
5470 }
5471 MI.eraseFromParent();
5472 return BB;
5473}
5474
5475 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5476 switch (Opc) {
5477 case AMDGPU::S_MIN_U32:
5478 return std::numeric_limits<uint32_t>::max();
5479 case AMDGPU::S_MIN_I32:
5480 return std::numeric_limits<int32_t>::max();
5481 case AMDGPU::S_MAX_U32:
5482 return std::numeric_limits<uint32_t>::min();
5483 case AMDGPU::S_MAX_I32:
5484 return std::numeric_limits<int32_t>::min();
5485 case AMDGPU::V_ADD_F32_e64: // -0.0
5486 return 0x80000000;
5487 case AMDGPU::V_SUB_F32_e64: // +0.0
5488 return 0x0;
5489 case AMDGPU::S_ADD_I32:
5490 case AMDGPU::S_SUB_I32:
5491 case AMDGPU::S_OR_B32:
5492 case AMDGPU::S_XOR_B32:
5493 return std::numeric_limits<uint32_t>::min();
5494 case AMDGPU::S_AND_B32:
5495 return std::numeric_limits<uint32_t>::max();
5496 case AMDGPU::V_MIN_F32_e64:
5497 case AMDGPU::V_MAX_F32_e64:
5498 return 0x7fc00000; // qNAN
5499 default:
5501 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5502 }
5503}
5504
5505 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5506 switch (Opc) {
5507 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5508 return std::numeric_limits<uint64_t>::max();
5509 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5510 return std::numeric_limits<int64_t>::max();
5511 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5512 return std::numeric_limits<uint64_t>::min();
5513 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5514 return std::numeric_limits<int64_t>::min();
5515 case AMDGPU::S_ADD_U64_PSEUDO:
5516 case AMDGPU::S_SUB_U64_PSEUDO:
5517 case AMDGPU::S_OR_B64:
5518 case AMDGPU::S_XOR_B64:
5519 return std::numeric_limits<uint64_t>::min();
5520 case AMDGPU::S_AND_B64:
5521 return std::numeric_limits<uint64_t>::max();
5522 default:
5524 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5525 }
5526}
5527
5528static bool is32bitWaveReduceOperation(unsigned Opc) {
5529 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5530 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5531 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5532 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5533 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5534 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5535 Opc == AMDGPU::V_SUB_F32_e64;
5536}
5537
5539 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5540 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5541}
5542
5543 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5544 MachineBasicBlock &BB,
5545 const GCNSubtarget &ST,
5546 unsigned Opc) {
5547 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5548 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5549 const DebugLoc &DL = MI.getDebugLoc();
5550 const SIInstrInfo *TII = ST.getInstrInfo();
5551
5552 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5553 Register SrcReg = MI.getOperand(1).getReg();
5554 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5555 Register DstReg = MI.getOperand(0).getReg();
5556 MachineBasicBlock *RetBB = nullptr;
5557 if (isSGPR) {
5558 switch (Opc) {
5559 case AMDGPU::S_MIN_U32:
5560 case AMDGPU::S_MIN_I32:
5561 case AMDGPU::V_MIN_F32_e64:
5562 case AMDGPU::S_MAX_U32:
5563 case AMDGPU::S_MAX_I32:
5564 case AMDGPU::V_MAX_F32_e64:
5565 case AMDGPU::S_AND_B32:
5566 case AMDGPU::S_OR_B32: {
5567 // Idempotent operations.
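// A uniform (SGPR) input means every active lane contributes the same value,
// and e.g. umin(x, x, ..., x) == x, so the reduction is just a copy.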
5568 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5569 RetBB = &BB;
5570 break;
5571 }
5572 case AMDGPU::V_CMP_LT_U64_e64: // umin
5573 case AMDGPU::V_CMP_LT_I64_e64: // min
5574 case AMDGPU::V_CMP_GT_U64_e64: // umax
5575 case AMDGPU::V_CMP_GT_I64_e64: // max
5576 case AMDGPU::S_AND_B64:
5577 case AMDGPU::S_OR_B64: {
5578 // Idempotent operations.
5579 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5580 RetBB = &BB;
5581 break;
5582 }
5583 case AMDGPU::S_XOR_B32:
5584 case AMDGPU::S_XOR_B64:
5585 case AMDGPU::S_ADD_I32:
5586 case AMDGPU::S_ADD_U64_PSEUDO:
5587 case AMDGPU::V_ADD_F32_e64:
5588 case AMDGPU::S_SUB_I32:
5589 case AMDGPU::S_SUB_U64_PSEUDO:
5590 case AMDGPU::V_SUB_F32_e64: {
5591 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5592 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5593 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5594 Register NumActiveLanes =
5595 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5596
5597 bool IsWave32 = ST.isWave32();
5598 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5599 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5600 unsigned BitCountOpc =
5601 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5602
5603 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5604
5605 auto NewAccumulator =
5606 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5607 .addReg(ExecMask);
5608
5609 switch (Opc) {
5610 case AMDGPU::S_XOR_B32:
5611 case AMDGPU::S_XOR_B64: {
5612 // Performing an XOR operation on a uniform value
5613 // depends on the parity of the number of active lanes.
5614 // For even parity the result will be 0; for odd
5615 // parity it will be the same as the input value.
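// For example, with three active lanes x ^ x ^ x == x, while with four
// active lanes x ^ x ^ x ^ x == 0.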
5616 Register ParityRegister =
5617 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5618
5619 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5620 .addReg(NewAccumulator->getOperand(0).getReg())
5621 .addImm(1)
5622 .setOperandDead(3); // Dead scc
5623 if (Opc == AMDGPU::S_XOR_B32) {
5624 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5625 .addReg(SrcReg)
5626 .addReg(ParityRegister);
5627 } else {
5628 Register DestSub0 =
5629 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5630 Register DestSub1 =
5631 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5632
5633 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5634 const TargetRegisterClass *SrcSubRC =
5635 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5636
5637 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5638 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5639 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5640 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5641
5642 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5643 .add(Op1L)
5644 .addReg(ParityRegister);
5645
5646 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5647 .add(Op1H)
5648 .addReg(ParityRegister);
5649
5650 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5651 .addReg(DestSub0)
5652 .addImm(AMDGPU::sub0)
5653 .addReg(DestSub1)
5654 .addImm(AMDGPU::sub1);
5655 }
5656 break;
5657 }
5658 case AMDGPU::S_SUB_I32: {
5659 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5660
5661 // Take the negation of the source operand.
5662 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5663 .addImm(0)
5664 .addReg(SrcReg);
5665 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5666 .addReg(NegatedVal)
5667 .addReg(NewAccumulator->getOperand(0).getReg());
5668 break;
5669 }
5670 case AMDGPU::S_ADD_I32: {
5671 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5672 .addReg(SrcReg)
5673 .addReg(NewAccumulator->getOperand(0).getReg());
5674 break;
5675 }
5676 case AMDGPU::S_ADD_U64_PSEUDO:
5677 case AMDGPU::S_SUB_U64_PSEUDO: {
5678 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5679 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5680 Register Op1H_Op0L_Reg =
5681 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5682 Register Op1L_Op0H_Reg =
5683 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5684 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5685 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5686 Register NegatedValLo =
5687 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5688 Register NegatedValHi =
5689 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5690
5691 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5692 const TargetRegisterClass *Src1SubRC =
5693 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5694
5695 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5696 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5697 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5698 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5699
5700 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5701 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5702 .addImm(0)
5703 .addReg(NewAccumulator->getOperand(0).getReg())
5704 .setOperandDead(3); // Dead scc
5705 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5706 .addReg(NegatedValLo)
5707 .addImm(31)
5708 .setOperandDead(3); // Dead scc
5709 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5710 .add(Op1L)
5711 .addReg(NegatedValHi);
5712 }
5713 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5714 ? NegatedValLo
5715 : NewAccumulator->getOperand(0).getReg();
5716 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5717 .add(Op1L)
5718 .addReg(LowOpcode);
5719 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5720 .add(Op1L)
5721 .addReg(LowOpcode);
5722 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5723 .add(Op1H)
5724 .addReg(LowOpcode);
5725
5726 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5727 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5728 .addReg(CarryReg)
5729 .addReg(Op1H_Op0L_Reg)
5730 .setOperandDead(3); // Dead scc
5731
5732 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5733 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5734 .addReg(HiVal)
5735 .addReg(Op1L_Op0H_Reg)
5736 .setOperandDead(3); // Dead scc
5737 }
5738 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5739 .addReg(DestSub0)
5740 .addImm(AMDGPU::sub0)
5741 .addReg(DestSub1)
5742 .addImm(AMDGPU::sub1);
5743 break;
5744 }
5745 case AMDGPU::V_ADD_F32_e64:
5746 case AMDGPU::V_SUB_F32_e64: {
5747 Register ActiveLanesVreg =
5748 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5749 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5750 // Get number of active lanes as a float val.
5751 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5752 ActiveLanesVreg)
5753 .addReg(NewAccumulator->getOperand(0).getReg())
5754 .addImm(0) // clamp
5755 .addImm(0); // output-modifier
5756
5757 // Take negation of input for SUB reduction
5758 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5759 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5760 .addImm(srcMod) // src0 modifier
5761 .addReg(SrcReg)
5762 .addImm(0) // src1 modifier
5763 .addReg(ActiveLanesVreg)
5764 .addImm(0) // clamp
5765 .addImm(0); // output-mod
5766 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5767 .addReg(DstVreg);
5768 }
5769 }
5770 RetBB = &BB;
5771 }
5772 }
5773 } else {
5774 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5775 // operand. For now, for all the cases (default, Iterative and DPP) we use the
5776 // iterative approach by default.
5777
5778 // To reduce the VGPR using the iterative approach, we need to iterate
5779 // over all the active lanes. Lowering consists of a ComputeLoop,
5780 // which iterates over only the active lanes. We use a copy of the EXEC
5781 // register as the induction variable, and every active lane clears its bit
5782 // using bitset0 so that we get the next active lane for the next iteration.
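// Roughly, the emitted structure looks like this (illustrative sketch only,
// not the exact MIR):
//   ActiveBits = copy of EXEC
// ComputeLoop:
//   Lane       = S_FF1(ActiveBits)           // lowest remaining active lane
//   LaneValue  = V_READLANE(Src, Lane)
//   Accum      = <Opc>(Accum, LaneValue)
//   ActiveBits = S_BITSET0(Lane, ActiveBits)
//   if (ActiveBits != 0) goto ComputeLoop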
5784 Register SrcReg = MI.getOperand(1).getReg();
5785 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5787
5788 // Create control flow for the loop:
5789 // split MI's machine basic block to form the loop.
5790 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5791
5792 // Create virtual registers required for lowering.
5793 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5794 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5795 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5796 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5797 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5798 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5799 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5800 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5801 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5802
5803 bool IsWave32 = ST.isWave32();
5804 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5805 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5806
5807 // Create the initial values of the induction variable (from EXEC) and the
5808 // accumulator, and insert a branch to the newly created ComputeLoop block.
5809 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5810 if (is32BitOpc) {
5811 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5812 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5813 .addImm(IdentityValue);
5814 } else {
5815 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5816 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5817 .addImm(IdentityValue);
5818 }
5819 // clang-format off
5820 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5821 .addMBB(ComputeLoop);
5822 // clang-format on
5823
5824 // Start constructing ComputeLoop
5825 I = ComputeLoop->begin();
5826 auto Accumulator =
5827 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5828 .addReg(IdentityValReg)
5829 .addMBB(&BB);
5830 auto ActiveBits =
5831 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5832 .addReg(LoopIterator)
5833 .addMBB(&BB);
5834
5835 I = ComputeLoop->end();
5836 MachineInstr *NewAccumulator;
5837 // Perform the computations
5838 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5839 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5840 .addReg(ActiveBitsReg);
5841 if (is32BitOpc) {
5842 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5843 LaneValueReg)
5844 .addReg(SrcReg)
5845 .addReg(FF1Reg);
5846 if (isFPOp) {
5847 Register LaneValVreg =
5848 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5849 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5850 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5851 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5852 LaneValVreg)
5853 .addReg(LaneValueReg);
5854 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5855 .addImm(0) // src0 modifier
5856 .addReg(Accumulator->getOperand(0).getReg())
5857 .addImm(0) // src1 modifier
5858 .addReg(LaneValVreg)
5859 .addImm(0) // clamp
5860 .addImm(0); // omod
5861 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5862 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5863 .addReg(DstVreg);
5864 } else {
5865 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5866 .addReg(Accumulator->getOperand(0).getReg())
5867 .addReg(LaneValueReg);
5868 }
5869 } else {
5870 Register LaneValueLoReg =
5871 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5872 Register LaneValueHiReg =
5873 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5874 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5875 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5876 const TargetRegisterClass *SrcSubRC =
5877 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5878 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5879 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5880 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5881 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5882 // lane value input should be in an sgpr
5883 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5884 LaneValueLoReg)
5885 .add(Op1L)
5886 .addReg(FF1Reg);
5887 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5888 LaneValueHiReg)
5889 .add(Op1H)
5890 .addReg(FF1Reg);
5891 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5892 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5893 .addReg(LaneValueLoReg)
5894 .addImm(AMDGPU::sub0)
5895 .addReg(LaneValueHiReg)
5896 .addImm(AMDGPU::sub1);
5897 switch (Opc) {
5898 case AMDGPU::S_OR_B64:
5899 case AMDGPU::S_AND_B64:
5900 case AMDGPU::S_XOR_B64: {
5901 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5902 .addReg(Accumulator->getOperand(0).getReg())
5903 .addReg(LaneValue->getOperand(0).getReg())
5904 .setOperandDead(3); // Dead scc
5905 break;
5906 }
5907 case AMDGPU::V_CMP_GT_I64_e64:
5908 case AMDGPU::V_CMP_GT_U64_e64:
5909 case AMDGPU::V_CMP_LT_I64_e64:
5910 case AMDGPU::V_CMP_LT_U64_e64: {
5911 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5912 Register ComparisonResultReg =
5913 MRI.createVirtualRegister(WaveMaskRegClass);
5914 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5915 const TargetRegisterClass *VSubRegClass =
5916 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5917 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5918 MachineOperand SrcReg0Sub0 =
5919 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5920 VregClass, AMDGPU::sub0, VSubRegClass);
5921 MachineOperand SrcReg0Sub1 =
5922 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5923 VregClass, AMDGPU::sub1, VSubRegClass);
5924 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5925 AccumulatorVReg)
5926 .add(SrcReg0Sub0)
5927 .addImm(AMDGPU::sub0)
5928 .add(SrcReg0Sub1)
5929 .addImm(AMDGPU::sub1);
5930 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5931 .addReg(LaneValue->getOperand(0).getReg())
5932 .addReg(AccumulatorVReg);
5933
5934 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5935 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5936 .addReg(LaneMaskReg)
5937 .addReg(ActiveBitsReg);
5938
5939 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5940 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5941 .addReg(LaneValue->getOperand(0).getReg())
5942 .addReg(Accumulator->getOperand(0).getReg());
5943 break;
5944 }
5945 case AMDGPU::S_ADD_U64_PSEUDO:
5946 case AMDGPU::S_SUB_U64_PSEUDO: {
5947 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5948 .addReg(Accumulator->getOperand(0).getReg())
5949 .addReg(LaneValue->getOperand(0).getReg());
5950 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5951 break;
5952 }
5953 }
5954 }
5955 // Manipulate the iterator to get the next active lane
5956 unsigned BITSETOpc =
5957 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5958 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5959 .addReg(FF1Reg)
5960 .addReg(ActiveBitsReg);
5961
5962 // Add phi nodes
5963 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5964 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5965
5966 // Create the loop-back branch.
5967 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5968 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5969 .addReg(NewActiveBitsReg)
5970 .addImm(0);
5971 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5972 .addMBB(ComputeLoop);
5973
5974 RetBB = ComputeEnd;
5975 }
5976 MI.eraseFromParent();
5977 return RetBB;
5978}
5979
5980 MachineBasicBlock *
5981 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5982 MachineBasicBlock *BB) const {
5983 MachineFunction *MF = BB->getParent();
5984 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5985 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5986 const SIInstrInfo *TII = ST.getInstrInfo();
5987 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5988 MachineRegisterInfo &MRI = MF->getRegInfo();
5989 const DebugLoc &DL = MI.getDebugLoc();
5990
5991 switch (MI.getOpcode()) {
5992 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5993 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5994 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5995 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5996 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5997 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5998 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5999 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6000 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6001 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6002 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6003 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6004 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6005 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6006 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6007 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6008 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6009 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6010 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6011 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6012 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6013 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6014 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6015 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6016 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6017 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6018 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6019 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6020 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6021 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6022 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6023 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6024 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6025 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6026 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6027 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6028 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6029 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6030 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6031 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6032 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6033 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6034 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6035 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6036 case AMDGPU::S_UADDO_PSEUDO:
6037 case AMDGPU::S_USUBO_PSEUDO: {
6038 MachineOperand &Dest0 = MI.getOperand(0);
6039 MachineOperand &Dest1 = MI.getOperand(1);
6040 MachineOperand &Src0 = MI.getOperand(2);
6041 MachineOperand &Src1 = MI.getOperand(3);
6042
6043 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6044 ? AMDGPU::S_ADD_U32
6045 : AMDGPU::S_SUB_U32;
6046 // clang-format off
6047 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6048 .add(Src0)
6049 .add(Src1);
6050 // clang-format on
6051
6052 unsigned SelOpc =
6053 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6054 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6055
6056 MI.eraseFromParent();
6057 return BB;
6058 }
6059 case AMDGPU::S_ADD_U64_PSEUDO:
6060 case AMDGPU::S_SUB_U64_PSEUDO: {
6061 return Expand64BitScalarArithmetic(MI, BB);
6062 }
6063 case AMDGPU::V_ADD_U64_PSEUDO:
6064 case AMDGPU::V_SUB_U64_PSEUDO: {
6065 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6066
6067 MachineOperand &Dest = MI.getOperand(0);
6068 MachineOperand &Src0 = MI.getOperand(1);
6069 MachineOperand &Src1 = MI.getOperand(2);
6070
6071 if (ST.hasAddSubU64Insts()) {
6072 auto I = BuildMI(*BB, MI, DL,
6073 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6074 : AMDGPU::V_SUB_U64_e64),
6075 Dest.getReg())
6076 .add(Src0)
6077 .add(Src1)
6078 .addImm(0); // clamp
6079 TII->legalizeOperands(*I);
6080 MI.eraseFromParent();
6081 return BB;
6082 }
6083
6084 if (IsAdd && ST.hasLshlAddU64Inst()) {
6085 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6086 Dest.getReg())
6087 .add(Src0)
6088 .addImm(0)
6089 .add(Src1);
6090 TII->legalizeOperands(*Add);
6091 MI.eraseFromParent();
6092 return BB;
6093 }
6094
6095 const auto *CarryRC = TRI->getWaveMaskRegClass();
6096
6097 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6098 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6099
6100 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6101 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6102
6103 const TargetRegisterClass *Src0RC = Src0.isReg()
6104 ? MRI.getRegClass(Src0.getReg())
6105 : &AMDGPU::VReg_64RegClass;
6106 const TargetRegisterClass *Src1RC = Src1.isReg()
6107 ? MRI.getRegClass(Src1.getReg())
6108 : &AMDGPU::VReg_64RegClass;
6109
6110 const TargetRegisterClass *Src0SubRC =
6111 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6112 const TargetRegisterClass *Src1SubRC =
6113 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6114
6115 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6116 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6117 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6118 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6119
6120 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6121 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6122 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6123 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6124
6125 unsigned LoOpc =
6126 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6127 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6128 .addReg(CarryReg, RegState::Define)
6129 .add(SrcReg0Sub0)
6130 .add(SrcReg1Sub0)
6131 .addImm(0); // clamp bit
6132
6133 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6134 MachineInstr *HiHalf =
6135 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6136 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6137 .add(SrcReg0Sub1)
6138 .add(SrcReg1Sub1)
6139 .addReg(CarryReg, RegState::Kill)
6140 .addImm(0); // clamp bit
6141
6142 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6143 .addReg(DestSub0)
6144 .addImm(AMDGPU::sub0)
6145 .addReg(DestSub1)
6146 .addImm(AMDGPU::sub1);
6147 TII->legalizeOperands(*LoHalf);
6148 TII->legalizeOperands(*HiHalf);
6149 MI.eraseFromParent();
6150 return BB;
6151 }
6152 case AMDGPU::S_ADD_CO_PSEUDO:
6153 case AMDGPU::S_SUB_CO_PSEUDO: {
6154 // This pseudo can only be selected
6155 // from a uniform add/subcarry node. All the VGPR operands
6156 // are therefore assumed to be splat vectors.
6157 MachineBasicBlock::iterator MII = MI;
6158 MachineOperand &Dest = MI.getOperand(0);
6159 MachineOperand &CarryDest = MI.getOperand(1);
6160 MachineOperand &Src0 = MI.getOperand(2);
6161 MachineOperand &Src1 = MI.getOperand(3);
6162 MachineOperand &Src2 = MI.getOperand(4);
6163 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6164 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6165 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6166 .addReg(Src0.getReg());
6167 Src0.setReg(RegOp0);
6168 }
6169 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6170 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6171 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6172 .addReg(Src1.getReg());
6173 Src1.setReg(RegOp1);
6174 }
6175 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6176 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6177 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6178 .addReg(Src2.getReg());
6179 Src2.setReg(RegOp2);
6180 }
6181
6182 if (ST.isWave64()) {
6183 if (ST.hasScalarCompareEq64()) {
6184 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6185 .addReg(Src2.getReg())
6186 .addImm(0);
6187 } else {
6188 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6189 const TargetRegisterClass *SubRC =
6190 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6191 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6192 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6193 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6194 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6195 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6196
6197 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6198 .add(Src2Sub0)
6199 .add(Src2Sub1);
6200
6201 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6202 .addReg(Src2_32, RegState::Kill)
6203 .addImm(0);
6204 }
6205 } else {
6206 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6207 .addReg(Src2.getReg())
6208 .addImm(0);
6209 }
6210
6211 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6212 ? AMDGPU::S_ADDC_U32
6213 : AMDGPU::S_SUBB_U32;
6214
6215 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6216
6217 unsigned SelOpc =
6218 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6219
6220 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6221 .addImm(-1)
6222 .addImm(0);
6223
6224 MI.eraseFromParent();
6225 return BB;
6226 }
6227 case AMDGPU::SI_INIT_M0: {
6228 MachineOperand &M0Init = MI.getOperand(0);
6229 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6230 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6231 AMDGPU::M0)
6232 .add(M0Init);
6233 MI.eraseFromParent();
6234 return BB;
6235 }
6236 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6237 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6238 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6239 TII->get(AMDGPU::S_CMP_EQ_U32))
6240 .addImm(0)
6241 .addImm(0);
6242 return BB;
6243 }
6244 case AMDGPU::GET_GROUPSTATICSIZE: {
6245 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6246 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6247 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6248 .add(MI.getOperand(0))
6249 .addImm(MFI->getLDSSize());
6250 MI.eraseFromParent();
6251 return BB;
6252 }
6253 case AMDGPU::GET_SHADERCYCLESHILO: {
6255 // The algorithm is:
6256 //
6257 // hi1 = getreg(SHADER_CYCLES_HI)
6258 // lo1 = getreg(SHADER_CYCLES_LO)
6259 // hi2 = getreg(SHADER_CYCLES_HI)
6260 //
6261 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6262 // Otherwise there was overflow and the result is hi2:0. In both cases the
6263 // result should represent the actual time at some point during the sequence
6264 // of three getregs.
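// For example, if the low counter wraps between the two reads of
// SHADER_CYCLES_HI, then hi1 != hi2 and hi2:0 corresponds to the instant of
// the wrap, which still lies within the window of the three reads.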
6265 using namespace AMDGPU::Hwreg;
6266 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6267 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6268 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6269 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6270 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6271 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6272 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6273 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6274 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6275 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6276 .addReg(RegHi1)
6277 .addReg(RegHi2);
6278 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6279 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6280 .addReg(RegLo1)
6281 .addImm(0);
6282 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6283 .add(MI.getOperand(0))
6284 .addReg(RegLo)
6285 .addImm(AMDGPU::sub0)
6286 .addReg(RegHi2)
6287 .addImm(AMDGPU::sub1);
6288 MI.eraseFromParent();
6289 return BB;
6290 }
6291 case AMDGPU::SI_INDIRECT_SRC_V1:
6292 case AMDGPU::SI_INDIRECT_SRC_V2:
6293 case AMDGPU::SI_INDIRECT_SRC_V3:
6294 case AMDGPU::SI_INDIRECT_SRC_V4:
6295 case AMDGPU::SI_INDIRECT_SRC_V5:
6296 case AMDGPU::SI_INDIRECT_SRC_V6:
6297 case AMDGPU::SI_INDIRECT_SRC_V7:
6298 case AMDGPU::SI_INDIRECT_SRC_V8:
6299 case AMDGPU::SI_INDIRECT_SRC_V9:
6300 case AMDGPU::SI_INDIRECT_SRC_V10:
6301 case AMDGPU::SI_INDIRECT_SRC_V11:
6302 case AMDGPU::SI_INDIRECT_SRC_V12:
6303 case AMDGPU::SI_INDIRECT_SRC_V16:
6304 case AMDGPU::SI_INDIRECT_SRC_V32:
6305 return emitIndirectSrc(MI, *BB, *getSubtarget());
6306 case AMDGPU::SI_INDIRECT_DST_V1:
6307 case AMDGPU::SI_INDIRECT_DST_V2:
6308 case AMDGPU::SI_INDIRECT_DST_V3:
6309 case AMDGPU::SI_INDIRECT_DST_V4:
6310 case AMDGPU::SI_INDIRECT_DST_V5:
6311 case AMDGPU::SI_INDIRECT_DST_V6:
6312 case AMDGPU::SI_INDIRECT_DST_V7:
6313 case AMDGPU::SI_INDIRECT_DST_V8:
6314 case AMDGPU::SI_INDIRECT_DST_V9:
6315 case AMDGPU::SI_INDIRECT_DST_V10:
6316 case AMDGPU::SI_INDIRECT_DST_V11:
6317 case AMDGPU::SI_INDIRECT_DST_V12:
6318 case AMDGPU::SI_INDIRECT_DST_V16:
6319 case AMDGPU::SI_INDIRECT_DST_V32:
6320 return emitIndirectDst(MI, *BB, *getSubtarget());
6321 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6322 case AMDGPU::SI_KILL_I1_PSEUDO:
6323 return splitKillBlock(MI, BB);
6324 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6325 Register Dst = MI.getOperand(0).getReg();
6326 const MachineOperand &Src0 = MI.getOperand(1);
6327 const MachineOperand &Src1 = MI.getOperand(2);
6328 Register SrcCond = MI.getOperand(3).getReg();
6329
6330 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6331 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6332 const auto *CondRC = TRI->getWaveMaskRegClass();
6333 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6334
6335 const TargetRegisterClass *Src0RC = Src0.isReg()
6336 ? MRI.getRegClass(Src0.getReg())
6337 : &AMDGPU::VReg_64RegClass;
6338 const TargetRegisterClass *Src1RC = Src1.isReg()
6339 ? MRI.getRegClass(Src1.getReg())
6340 : &AMDGPU::VReg_64RegClass;
6341
6342 const TargetRegisterClass *Src0SubRC =
6343 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6344 const TargetRegisterClass *Src1SubRC =
6345 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6346
6347 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6348 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6349 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6350 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6351
6352 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6353 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6354 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6355 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6356
6357 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6358 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6359 .addImm(0)
6360 .add(Src0Sub0)
6361 .addImm(0)
6362 .add(Src1Sub0)
6363 .addReg(SrcCondCopy);
6364 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6365 .addImm(0)
6366 .add(Src0Sub1)
6367 .addImm(0)
6368 .add(Src1Sub1)
6369 .addReg(SrcCondCopy);
6370
6371 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6372 .addReg(DstLo)
6373 .addImm(AMDGPU::sub0)
6374 .addReg(DstHi)
6375 .addImm(AMDGPU::sub1);
6376 MI.eraseFromParent();
6377 return BB;
6378 }
6379 case AMDGPU::SI_BR_UNDEF: {
6380 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6381 .add(MI.getOperand(0));
6382 Br->getOperand(1).setIsUndef(); // read undef SCC
6383 MI.eraseFromParent();
6384 return BB;
6385 }
6386 case AMDGPU::ADJCALLSTACKUP:
6387 case AMDGPU::ADJCALLSTACKDOWN: {
6388 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6389 MachineInstrBuilder MIB(*MF, &MI);
6390 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6391 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6392 return BB;
6393 }
6394 case AMDGPU::SI_CALL_ISEL: {
6395 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6396
6398 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6399
6400 for (const MachineOperand &MO : MI.operands())
6401 MIB.add(MO);
6402
6403 MIB.cloneMemRefs(MI);
6404 MI.eraseFromParent();
6405 return BB;
6406 }
6407 case AMDGPU::V_ADD_CO_U32_e32:
6408 case AMDGPU::V_SUB_CO_U32_e32:
6409 case AMDGPU::V_SUBREV_CO_U32_e32: {
6410 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6411 unsigned Opc = MI.getOpcode();
6412
6413 bool NeedClampOperand = false;
6414 if (TII->pseudoToMCOpcode(Opc) == -1) {
6415 Opc = AMDGPU::getVOPe64(Opc);
6416 NeedClampOperand = true;
6417 }
6418
6419 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6420 if (TII->isVOP3(*I)) {
6421 I.addReg(TRI->getVCC(), RegState::Define);
6422 }
6423 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6424 if (NeedClampOperand)
6425 I.addImm(0); // clamp bit for e64 encoding
6426
6427 TII->legalizeOperands(*I);
6428
6429 MI.eraseFromParent();
6430 return BB;
6431 }
6432 case AMDGPU::V_ADDC_U32_e32:
6433 case AMDGPU::V_SUBB_U32_e32:
6434 case AMDGPU::V_SUBBREV_U32_e32:
6435 // These instructions have an implicit use of vcc which counts towards the
6436 // constant bus limit.
6437 TII->legalizeOperands(MI);
6438 return BB;
6439 case AMDGPU::DS_GWS_INIT:
6440 case AMDGPU::DS_GWS_SEMA_BR:
6441 case AMDGPU::DS_GWS_BARRIER:
6442 case AMDGPU::DS_GWS_SEMA_V:
6443 case AMDGPU::DS_GWS_SEMA_P:
6444 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6445 // An s_waitcnt 0 is required to be the instruction immediately following.
6446 if (getSubtarget()->hasGWSAutoReplay()) {
6447 bundleInstWithWaitcnt(MI);
6448 return BB;
6449 }
6450
6451 return emitGWSMemViolTestLoop(MI, BB);
6452 case AMDGPU::S_SETREG_B32: {
6453 // Try to optimize cases that only set the denormal mode or rounding mode.
6454 //
6455 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6456 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6457 // instead.
6458 //
6459 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6460 // allow you to have a no side effect instruction in the output of a
6461 // sideeffecting pattern.
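// For example, an s_setreg_b32 of a constant that covers exactly the 4
// round-mode bits can be emitted as a single s_round_mode instruction.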
6462 auto [ID, Offset, Width] =
6463 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6464 if (ID != AMDGPU::Hwreg::ID_MODE)
6465 return BB;
6466
6467 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6468 const unsigned SetMask = WidthMask << Offset;
6469
6470 if (getSubtarget()->hasDenormModeInst()) {
6471 unsigned SetDenormOp = 0;
6472 unsigned SetRoundOp = 0;
6473
6474 // The dedicated instructions can only set the whole denorm or round mode
6475 // at once, not a subset of bits in either.
6476 if (SetMask ==
6477 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6478 // If this fully sets both the round and denorm mode, emit the two
6479 // dedicated instructions for these.
6480 SetRoundOp = AMDGPU::S_ROUND_MODE;
6481 SetDenormOp = AMDGPU::S_DENORM_MODE;
6482 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6483 SetRoundOp = AMDGPU::S_ROUND_MODE;
6484 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6485 SetDenormOp = AMDGPU::S_DENORM_MODE;
6486 }
6487
6488 if (SetRoundOp || SetDenormOp) {
6489 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6490 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6491 unsigned ImmVal = Def->getOperand(1).getImm();
6492 if (SetRoundOp) {
6493 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6494 .addImm(ImmVal & 0xf);
6495
6496 // If we also have the denorm mode, get just the denorm mode bits.
6497 ImmVal >>= 4;
6498 }
6499
6500 if (SetDenormOp) {
6501 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6502 .addImm(ImmVal & 0xf);
6503 }
6504
6505 MI.eraseFromParent();
6506 return BB;
6507 }
6508 }
6509 }
6510
6511 // If only FP bits are touched, use the no-side-effects pseudo.
6512 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6513 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6514 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6515
6516 return BB;
6517 }
6518 case AMDGPU::S_INVERSE_BALLOT_U32:
6519 case AMDGPU::S_INVERSE_BALLOT_U64:
6520 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6521 // necessary. After that they are equivalent to a COPY.
6522 MI.setDesc(TII->get(AMDGPU::COPY));
6523 return BB;
6524 case AMDGPU::ENDPGM_TRAP: {
6525 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6526 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6527 MI.addOperand(MachineOperand::CreateImm(0));
6528 return BB;
6529 }
6530
6531 // We need a block split to make the real endpgm a terminator. We also don't
6532 // want to break phis in successor blocks, so we can't just delete to the
6533 // end of the block.
6534
6535 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6536 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6537 MF->push_back(TrapBB);
6538 // clang-format off
6539 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6540 .addImm(0);
6541 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6542 .addMBB(TrapBB);
6543 // clang-format on
6544
6545 BB->addSuccessor(TrapBB);
6546 MI.eraseFromParent();
6547 return SplitBB;
6548 }
6549 case AMDGPU::SIMULATED_TRAP: {
6550 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6551 MachineBasicBlock *SplitBB =
6552 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6553 MI.eraseFromParent();
6554 return SplitBB;
6555 }
6556 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6557 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6559
6560 // During ISel, it's difficult to propagate the original EXEC mask to use as
6561 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6562 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6563 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6564 Register OriginalExec = Setup->getOperand(0).getReg();
6565 MF->getRegInfo().clearKillFlags(OriginalExec);
6566 MI.getOperand(0).setReg(OriginalExec);
6567 return BB;
6568 }
6569 default:
6570 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6571 if (!MI.mayStore())
6573 return BB;
6574 }
6575 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6576 }
6577}
6578
6580 // This currently forces unfolding various combinations of fsub into fma with
6581 // free fneg'd operands. As long as we have fast FMA (controlled by
6582 // isFMAFasterThanFMulAndFAdd), we should perform these.
6583
6584 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6585 // most of these combines appear to be cycle neutral but save on instruction
6586 // count / code size.
6587 return true;
6588}
6589
6591
6593 EVT VT) const {
6594 if (!VT.isVector()) {
6595 return MVT::i1;
6596 }
6597 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6598}
6599
6601 // TODO: Should i16 be used always if legal? For now it would force VALU
6602 // shifts.
6603 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6604}
6605
6607 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6608 ? Ty.changeElementSize(16)
6609 : Ty.changeElementSize(32);
6610}
6611
6612 // Answering this is somewhat tricky and depends on the specific device, since
6613 // different devices have different rates for fma and for all f64 operations.
6614//
6615// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6616// regardless of which device (although the number of cycles differs between
6617// devices), so it is always profitable for f64.
6618//
6619// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6620// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6621// which we can always do even without fused FP ops since it returns the same
6622// result as the separate operations and since it is always full
6623// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6624// however does not support denormals, so we do report fma as faster if we have
6625// a fast fma device and require denormals.
6626//
6628 EVT VT) const {
6629 VT = VT.getScalarType();
6630
6631 switch (VT.getSimpleVT().SimpleTy) {
6632 case MVT::f32: {
6633 // If mad is not available this depends only on if f32 fma is full rate.
6634 if (!Subtarget->hasMadMacF32Insts())
6635 return Subtarget->hasFastFMAF32();
6636
6637 // Otherwise f32 mad is always full rate and returns the same result as
6638 // the separate operations, so it should be preferred over fma.
6639 // However, it does not support denormals.
6640 if (!denormalModeIsFlushAllF32(MF))
6641 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6642
6643 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6644 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6645 }
6646 case MVT::f64:
6647 return true;
6648 case MVT::f16:
6649 case MVT::bf16:
6650 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6651 default:
6652 break;
6653 }
6654
6655 return false;
6656}
6657
6659 LLT Ty) const {
6660 switch (Ty.getScalarSizeInBits()) {
6661 case 16:
6662 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6663 case 32:
6664 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6665 case 64:
6666 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6667 default:
6668 break;
6669 }
6670
6671 return false;
6672}
6673
6675 if (!Ty.isScalar())
6676 return false;
6677
6678 if (Ty.getScalarSizeInBits() == 16)
6679 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6680 if (Ty.getScalarSizeInBits() == 32)
6681 return Subtarget->hasMadMacF32Insts() &&
6682 denormalModeIsFlushAllF32(*MI.getMF());
6683
6684 return false;
6685}
6686
6688 const SDNode *N) const {
6689 // TODO: Check future ftz flag
6690 // v_mad_f32/v_mac_f32 do not support denormals.
6691 EVT VT = N->getValueType(0);
6692 if (VT == MVT::f32)
6693 return Subtarget->hasMadMacF32Insts() &&
6694 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6695 if (VT == MVT::f16) {
6696 return Subtarget->hasMadF16() &&
6697 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6698 }
6699
6700 return false;
6701}
6702
6703//===----------------------------------------------------------------------===//
6704// Custom DAG Lowering Operations
6705//===----------------------------------------------------------------------===//
6706
6707// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6708// wider vector type is legal.
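// For example, a v8f16 unary operation is split into two v4f16 operations
// whose results are concatenated back together, rather than being scalarized
// into eight f16 operations.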
6710 SelectionDAG &DAG) const {
6711 unsigned Opc = Op.getOpcode();
6712 EVT VT = Op.getValueType();
6713 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6714 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6715 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6716 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6717 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6718 VT == MVT::v32bf16);
6719
6720 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6721
6722 SDLoc SL(Op);
6723 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6724 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6725
6726 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6727}
6728
6729// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6730// regression whereby extra unnecessary instructions were added to codegen
6731 // for rotr operations, caused by legalising v2i32 OR. This resulted in extra
6732// instructions to extract the result from the vector.
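// For example, a v2i32 rotr is simply unrolled here into two scalar i32 rotr
// nodes rather than being widened or bitcast.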
6734 [[maybe_unused]] EVT VT = Op.getValueType();
6735
6736 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6737 VT == MVT::v16i32) &&
6738 "Unexpected ValueType.");
6739
6740 return DAG.UnrollVectorOp(Op.getNode());
6741}
6742
6743// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6744// wider vector type is legal.
6746 SelectionDAG &DAG) const {
6747 unsigned Opc = Op.getOpcode();
6748 EVT VT = Op.getValueType();
6749 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6750 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6751 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6752 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6753 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6754 VT == MVT::v32bf16);
6755
6756 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6757 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6758
6759 SDLoc SL(Op);
6760
6761 SDValue OpLo =
6762 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6763 SDValue OpHi =
6764 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6765
6766 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6767}
6768
6769SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6770 SelectionDAG &DAG) const {
6771 unsigned Opc = Op.getOpcode();
6772 EVT VT = Op.getValueType();
6773 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6774 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6775 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6776 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6777 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6778 VT == MVT::v32bf16);
6779
6780 SDValue Op0 = Op.getOperand(0);
6781 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6782 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6783 : std::pair(Op0, Op0);
6784
6785 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6786 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6787
6788 SDLoc SL(Op);
6789 auto ResVT = DAG.GetSplitDestVTs(VT);
6790
6791 SDValue OpLo =
6792 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6793 SDValue OpHi =
6794 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6795
6796 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6797}
6798
6799SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6800 switch (Op.getOpcode()) {
6801 default:
6802 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6803 case ISD::BRCOND:
6804 return LowerBRCOND(Op, DAG);
6805 case ISD::RETURNADDR:
6806 return LowerRETURNADDR(Op, DAG);
6807 case ISD::LOAD: {
6808 SDValue Result = LowerLOAD(Op, DAG);
6809 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6810 "Load should return a value and a chain");
6811 return Result;
6812 }
6813 case ISD::FSQRT: {
6814 EVT VT = Op.getValueType();
6815 if (VT == MVT::f32)
6816 return lowerFSQRTF32(Op, DAG);
6817 if (VT == MVT::f64)
6818 return lowerFSQRTF64(Op, DAG);
6819 return SDValue();
6820 }
6821 case ISD::FSIN:
6822 case ISD::FCOS:
6823 return LowerTrig(Op, DAG);
6824 case ISD::SELECT:
6825 return LowerSELECT(Op, DAG);
6826 case ISD::FDIV:
6827 return LowerFDIV(Op, DAG);
6828 case ISD::FFREXP:
6829 return LowerFFREXP(Op, DAG);
6830 case ISD::ATOMIC_CMP_SWAP:
6831 return LowerATOMIC_CMP_SWAP(Op, DAG);
6832 case ISD::STORE:
6833 return LowerSTORE(Op, DAG);
6834 case ISD::GlobalAddress: {
6835 MachineFunction &MF = DAG.getMachineFunction();
6836 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6837 return LowerGlobalAddress(MFI, Op, DAG);
6838 }
6839 case ISD::INTRINSIC_WO_CHAIN:
6840 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6841 case ISD::INTRINSIC_W_CHAIN:
6842 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6843 case ISD::INTRINSIC_VOID:
6844 return LowerINTRINSIC_VOID(Op, DAG);
6845 case ISD::ADDRSPACECAST:
6846 return lowerADDRSPACECAST(Op, DAG);
6847 case ISD::INSERT_SUBVECTOR:
6848 return lowerINSERT_SUBVECTOR(Op, DAG);
6849 case ISD::INSERT_VECTOR_ELT:
6850 return lowerINSERT_VECTOR_ELT(Op, DAG);
6851 case ISD::EXTRACT_VECTOR_ELT:
6852 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6853 case ISD::VECTOR_SHUFFLE:
6854 return lowerVECTOR_SHUFFLE(Op, DAG);
6855 case ISD::SCALAR_TO_VECTOR:
6856 return lowerSCALAR_TO_VECTOR(Op, DAG);
6857 case ISD::BUILD_VECTOR:
6858 return lowerBUILD_VECTOR(Op, DAG);
6859 case ISD::FP_ROUND:
6860 case ISD::STRICT_FP_ROUND:
6861 return lowerFP_ROUND(Op, DAG);
6862 case ISD::TRAP:
6863 return lowerTRAP(Op, DAG);
6864 case ISD::DEBUGTRAP:
6865 return lowerDEBUGTRAP(Op, DAG);
6866 case ISD::ABS:
6867 case ISD::FABS:
6868 case ISD::FNEG:
6869 case ISD::FCANONICALIZE:
6870 case ISD::BSWAP:
6871 return splitUnaryVectorOp(Op, DAG);
6872 case ISD::FMINNUM:
6873 case ISD::FMAXNUM:
6874 return lowerFMINNUM_FMAXNUM(Op, DAG);
6875 case ISD::FMINIMUMNUM:
6876 case ISD::FMAXIMUMNUM:
6877 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6878 case ISD::FMINIMUM:
6879 case ISD::FMAXIMUM:
6880 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6881 case ISD::FLDEXP:
6882 case ISD::STRICT_FLDEXP:
6883 return lowerFLDEXP(Op, DAG);
6884 case ISD::FMA:
6885 return splitTernaryVectorOp(Op, DAG);
6886 case ISD::FP_TO_SINT:
6887 case ISD::FP_TO_UINT:
6888 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
6889 Op.getValueType() == MVT::i16 &&
6890 Op.getOperand(0).getValueType() == MVT::f32) {
6891 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
6892 return Op;
6893 }
6894 return LowerFP_TO_INT(Op, DAG);
6895 case ISD::SHL:
6896 case ISD::SRA:
6897 case ISD::SRL:
6898 case ISD::ADD:
6899 case ISD::SUB:
6900 case ISD::SMIN:
6901 case ISD::SMAX:
6902 case ISD::UMIN:
6903 case ISD::UMAX:
6904 case ISD::FADD:
6905 case ISD::FMUL:
6906 case ISD::FMINNUM_IEEE:
6907 case ISD::FMAXNUM_IEEE:
6908 case ISD::UADDSAT:
6909 case ISD::USUBSAT:
6910 case ISD::SADDSAT:
6911 case ISD::SSUBSAT:
6912 return splitBinaryVectorOp(Op, DAG);
6913 case ISD::FCOPYSIGN:
6914 return lowerFCOPYSIGN(Op, DAG);
6915 case ISD::MUL:
6916 return lowerMUL(Op, DAG);
6917 case ISD::SMULO:
6918 case ISD::UMULO:
6919 return lowerXMULO(Op, DAG);
6920 case ISD::SMUL_LOHI:
6921 case ISD::UMUL_LOHI:
6922 return lowerXMUL_LOHI(Op, DAG);
6923 case ISD::DYNAMIC_STACKALLOC:
6924 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6925 case ISD::STACKSAVE:
6926 return LowerSTACKSAVE(Op, DAG);
6927 case ISD::GET_ROUNDING:
6928 return lowerGET_ROUNDING(Op, DAG);
6929 case ISD::SET_ROUNDING:
6930 return lowerSET_ROUNDING(Op, DAG);
6931 case ISD::PREFETCH:
6932 return lowerPREFETCH(Op, DAG);
6933 case ISD::FP_EXTEND:
6934 case ISD::STRICT_FP_EXTEND:
6935 return lowerFP_EXTEND(Op, DAG);
6936 case ISD::GET_FPENV:
6937 return lowerGET_FPENV(Op, DAG);
6938 case ISD::SET_FPENV:
6939 return lowerSET_FPENV(Op, DAG);
6940 case ISD::ROTR:
6941 return lowerROTR(Op, DAG);
6942 }
6943 return SDValue();
6944}
6945
6946// Used for D16: Casts the result of an instruction into the right vector type
6947// and packs the values if loads return them unpacked.
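// For example (illustrative), on a target with unpacked D16 memory operations a
// v4f16 load is returned as v4i32; each element is truncated to i16, rebuilt as
// v4i16, and bitcast back to v4f16.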
6948static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6949 const SDLoc &DL, SelectionDAG &DAG,
6950 bool Unpacked) {
6951 if (!LoadVT.isVector())
6952 return Result;
6953
6954 // Cast back to the original packed type or to a larger type that is a
6955 // multiple of 32 bits for D16. Widening the return type is required for
6956 // legalization.
6957 EVT FittingLoadVT = LoadVT;
6958 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6959 FittingLoadVT =
6961 LoadVT.getVectorNumElements() + 1);
6962 }
6963
6964 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6965 // Truncate to v2i16/v4i16.
6966 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6967
6968 // Work around the legalizer not scalarizing truncate after vector op
6969 // legalization by not creating an intermediate vector trunc.
6971 DAG.ExtractVectorElements(Result, Elts);
6972 for (SDValue &Elt : Elts)
6973 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6974
6975 // Pad illegal v1i16/v3i16 to v4i16
6976 if ((LoadVT.getVectorNumElements() % 2) == 1)
6977 Elts.push_back(DAG.getPOISON(MVT::i16));
6978
6979 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6980
6981 // Bitcast to original type (v2f16/v4f16).
6982 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6983 }
6984
6985 // Cast back to the original packed type.
6986 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6987}
6988
6989SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6990 SelectionDAG &DAG,
6992 bool IsIntrinsic) const {
6993 SDLoc DL(M);
6994
6995 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6996 EVT LoadVT = M->getValueType(0);
6997
6998 EVT EquivLoadVT = LoadVT;
6999 if (LoadVT.isVector()) {
7000 if (Unpacked) {
7001 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7002 LoadVT.getVectorNumElements());
7003 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7004 // Widen v3f16 to legal type
7005 EquivLoadVT =
7007 LoadVT.getVectorNumElements() + 1);
7008 }
7009 }
7010
7011 // Change from v4f16/v2f16 to EquivLoadVT.
7012 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7013
7015 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7016 M->getMemoryVT(), M->getMemOperand());
7017
7018 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7019
7020 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7021}
7022
7023SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7024 SelectionDAG &DAG,
7025 ArrayRef<SDValue> Ops) const {
7026 SDLoc DL(M);
7027 EVT LoadVT = M->getValueType(0);
7028 EVT EltType = LoadVT.getScalarType();
7029 EVT IntVT = LoadVT.changeTypeToInteger();
7030
7031 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7032
7033 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7034 bool IsTFE = M->getNumValues() == 3;
7035
7036 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7037 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7038 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7039 : AMDGPUISD::BUFFER_LOAD;
7040
7041 if (IsD16) {
7042 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7043 }
7044
7045 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7046 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7047 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7048 IsTFE);
7049
7050 if (isTypeLegal(LoadVT)) {
7051 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7052 M->getMemOperand(), DAG);
7053 }
7054
7055 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7056 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7057 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7058 M->getMemOperand(), DAG);
7059 return DAG.getMergeValues(
7060 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7061 DL);
7062}
7063
7065 SelectionDAG &DAG) {
7066 EVT VT = N->getValueType(0);
7067 unsigned CondCode = N->getConstantOperandVal(3);
7068 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7069 return DAG.getPOISON(VT);
7070
7071 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7072
7073 SDValue LHS = N->getOperand(1);
7074 SDValue RHS = N->getOperand(2);
7075
7076 SDLoc DL(N);
7077
7078 EVT CmpVT = LHS.getValueType();
7079 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7080 unsigned PromoteOp =
7082 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7083 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7084 }
7085
7086 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7087
7088 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7089 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7090
7091 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7092 DAG.getCondCode(CCOpcode));
7093 if (VT.bitsEq(CCVT))
7094 return SetCC;
7095 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7096}
7097
7099 SelectionDAG &DAG) {
7100 EVT VT = N->getValueType(0);
7101
7102 unsigned CondCode = N->getConstantOperandVal(3);
7103 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7104 return DAG.getPOISON(VT);
7105
7106 SDValue Src0 = N->getOperand(1);
7107 SDValue Src1 = N->getOperand(2);
7108 EVT CmpVT = Src0.getValueType();
7109 SDLoc SL(N);
7110
7111 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7112 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7113 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7114 }
7115
7116 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7117 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7118 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7119 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7120 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7121 DAG.getCondCode(CCOpcode));
7122 if (VT.bitsEq(CCVT))
7123 return SetCC;
7124 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7125}
7126
7128 SelectionDAG &DAG) {
7129 EVT VT = N->getValueType(0);
7130 SDValue Src = N->getOperand(1);
7131 SDLoc SL(N);
7132
7133 if (Src.getOpcode() == ISD::SETCC) {
7134 SDValue Op0 = Src.getOperand(0);
7135 SDValue Op1 = Src.getOperand(1);
7136 // Need to expand bfloat to float for comparison (setcc).
7137 if (Op0.getValueType() == MVT::bf16) {
7138 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7139 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7140 }
7141 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7142 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7143 }
7144 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7145 // (ballot 0) -> 0
7146 if (Arg->isZero())
7147 return DAG.getConstant(0, SL, VT);
7148
7149 // (ballot 1) -> EXEC/EXEC_LO
7150 if (Arg->isOne()) {
7151 Register Exec;
7152 if (VT.getScalarSizeInBits() == 32)
7153 Exec = AMDGPU::EXEC_LO;
7154 else if (VT.getScalarSizeInBits() == 64)
7155 Exec = AMDGPU::EXEC;
7156 else
7157 return SDValue();
7158
7159 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7160 }
7161 }
7162
7163 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7164 // ISD::SETNE)
7165 return DAG.getNode(
7166 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7167 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7168}
7169
7171 SelectionDAG &DAG) {
7172 EVT VT = N->getValueType(0);
7173 unsigned ValSize = VT.getSizeInBits();
7174 unsigned IID = N->getConstantOperandVal(0);
7175 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7176 IID == Intrinsic::amdgcn_permlanex16;
7177 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7178 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7179 SDLoc SL(N);
7180 MVT IntVT = MVT::getIntegerVT(ValSize);
7181 const GCNSubtarget *ST = TLI.getSubtarget();
7182 unsigned SplitSize = 32;
7183 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7184 ST->hasDPALU_DPP() &&
7185 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7186 SplitSize = 64;
7187
7188 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7189 SDValue Src2, MVT ValT) -> SDValue {
7190 SmallVector<SDValue, 8> Operands;
7191 switch (IID) {
7192 case Intrinsic::amdgcn_permlane16:
7193 case Intrinsic::amdgcn_permlanex16:
7194 case Intrinsic::amdgcn_update_dpp:
7195 Operands.push_back(N->getOperand(6));
7196 Operands.push_back(N->getOperand(5));
7197 Operands.push_back(N->getOperand(4));
7198 [[fallthrough]];
7199 case Intrinsic::amdgcn_writelane:
7200 Operands.push_back(Src2);
7201 [[fallthrough]];
7202 case Intrinsic::amdgcn_readlane:
7203 case Intrinsic::amdgcn_set_inactive:
7204 case Intrinsic::amdgcn_set_inactive_chain_arg:
7205 case Intrinsic::amdgcn_mov_dpp8:
7206 Operands.push_back(Src1);
7207 [[fallthrough]];
7208 case Intrinsic::amdgcn_readfirstlane:
7209 case Intrinsic::amdgcn_permlane64:
7210 Operands.push_back(Src0);
7211 break;
7212 default:
7213 llvm_unreachable("unhandled lane op");
7214 }
7215
7216 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7217 std::reverse(Operands.begin(), Operands.end());
7218
7219 if (SDNode *GL = N->getGluedNode()) {
7220 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7221 GL = GL->getOperand(0).getNode();
7222 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7223 SDValue(GL, 0)));
7224 }
7225
7226 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7227 };
7228
7229 SDValue Src0 = N->getOperand(1);
7230 SDValue Src1, Src2;
7231 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7232 IID == Intrinsic::amdgcn_mov_dpp8 ||
7233 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7234 Src1 = N->getOperand(2);
7235 if (IID == Intrinsic::amdgcn_writelane ||
7236 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7237 Src2 = N->getOperand(3);
7238 }
7239
7240 if (ValSize == SplitSize) {
7241 // Already legal
7242 return SDValue();
7243 }
7244
7245 if (ValSize < 32) {
7246 bool IsFloat = VT.isFloatingPoint();
7247 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7248 SL, MVT::i32);
7249
7250 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7251 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7252 SL, MVT::i32);
7253 }
7254
7255 if (IID == Intrinsic::amdgcn_writelane) {
7256 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7257 SL, MVT::i32);
7258 }
7259
7260 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7261 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7262 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7263 }
7264
7265 if (ValSize % SplitSize != 0)
7266 return SDValue();
7267
7268 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7269 EVT VT = N->getValueType(0);
7270 unsigned NE = VT.getVectorNumElements();
7271 EVT EltVT = VT.getVectorElementType();
7273 unsigned NumOperands = N->getNumOperands();
7274 SmallVector<SDValue, 4> Operands(NumOperands);
7275 SDNode *GL = N->getGluedNode();
7276
7277 // only handle convergencectrl_glue
7279
7280 for (unsigned i = 0; i != NE; ++i) {
7281 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7282 ++j) {
7283 SDValue Operand = N->getOperand(j);
7284 EVT OperandVT = Operand.getValueType();
7285 if (OperandVT.isVector()) {
7286 // A vector operand; extract a single element.
7287 EVT OperandEltVT = OperandVT.getVectorElementType();
7288 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7289 Operand, DAG.getVectorIdxConstant(i, SL));
7290 } else {
7291 // A scalar operand; just use it as is.
7292 Operands[j] = Operand;
7293 }
7294 }
7295
7296 if (GL)
7297 Operands[NumOperands - 1] =
7298 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7299 SDValue(GL->getOperand(0).getNode(), 0));
7300
7301 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7302 }
7303
7304 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7305 return DAG.getBuildVector(VecVT, SL, Scalars);
7306 };
7307
7308 if (VT.isVector()) {
7309 switch (MVT::SimpleValueType EltTy =
7311 case MVT::i32:
7312 case MVT::f32:
7313 if (SplitSize == 32) {
7314 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7315 return unrollLaneOp(LaneOp.getNode());
7316 }
7317 [[fallthrough]];
7318 case MVT::i16:
7319 case MVT::f16:
7320 case MVT::bf16: {
7321 unsigned SubVecNumElt =
7322 SplitSize / VT.getVectorElementType().getSizeInBits();
7323 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7325 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7326 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7327 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7328 DAG.getConstant(EltIdx, SL, MVT::i32));
7329
7330 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7331 IsPermLane16)
7332 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7333 DAG.getConstant(EltIdx, SL, MVT::i32));
7334
7335 if (IID == Intrinsic::amdgcn_writelane)
7336 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7337 DAG.getConstant(EltIdx, SL, MVT::i32));
7338
7339 Pieces.push_back(
7340 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7341 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7342 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7343 EltIdx += SubVecNumElt;
7344 }
7345 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7346 }
7347 default:
7348 // Handle all other cases by bitcasting to i32 vectors
7349 break;
7350 }
7351 }
7352
7353 MVT VecVT =
7354 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7355 Src0 = DAG.getBitcast(VecVT, Src0);
7356
7357 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7358 Src1 = DAG.getBitcast(VecVT, Src1);
7359
7360 if (IID == Intrinsic::amdgcn_writelane)
7361 Src2 = DAG.getBitcast(VecVT, Src2);
7362
7363 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7364 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7365 return DAG.getBitcast(VT, UnrolledLaneOp);
7366}
7367
7368void SITargetLowering::ReplaceNodeResults(SDNode *N,
7369 SmallVectorImpl<SDValue> &Results,
7370 SelectionDAG &DAG) const {
7371 switch (N->getOpcode()) {
7372 case ISD::INSERT_VECTOR_ELT: {
7373 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7374 Results.push_back(Res);
7375 return;
7376 }
7377 case ISD::EXTRACT_VECTOR_ELT: {
7378 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7379 Results.push_back(Res);
7380 return;
7381 }
7382 case ISD::INTRINSIC_WO_CHAIN: {
7383 unsigned IID = N->getConstantOperandVal(0);
7384 switch (IID) {
7385 case Intrinsic::amdgcn_make_buffer_rsrc:
7386 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7387 return;
7388 case Intrinsic::amdgcn_cvt_pkrtz: {
7389 SDValue Src0 = N->getOperand(1);
7390 SDValue Src1 = N->getOperand(2);
7391 SDLoc SL(N);
7392 SDValue Cvt =
7393 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7394 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7395 return;
7396 }
7397 case Intrinsic::amdgcn_cvt_pknorm_i16:
7398 case Intrinsic::amdgcn_cvt_pknorm_u16:
7399 case Intrinsic::amdgcn_cvt_pk_i16:
7400 case Intrinsic::amdgcn_cvt_pk_u16: {
7401 SDValue Src0 = N->getOperand(1);
7402 SDValue Src1 = N->getOperand(2);
7403 SDLoc SL(N);
7404 unsigned Opcode;
7405
7406 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7407 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7408 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7409 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7410 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7411 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7412 else
7413 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7414
7415 EVT VT = N->getValueType(0);
7416 if (isTypeLegal(VT))
7417 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7418 else {
7419 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7420 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7421 }
7422 return;
7423 }
7424 case Intrinsic::amdgcn_s_buffer_load: {
7425 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7426 // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
7427 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7428 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7429 // s_buffer_load_i8.
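    // Illustrative flow for a uniform offset (using the nodes built below):
    //   llvm.amdgcn.s.buffer.load.i8
    //     -> AMDGPUISD::SBUFFER_LOAD_UBYTE (i32) -> truncate to i8
    // and a later sign-extending use lets the combiner turn the unsigned form
    // into the signed one.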
7430 if (!Subtarget->hasScalarSubwordLoads())
7431 return;
7432 SDValue Op = SDValue(N, 0);
7433 SDValue Rsrc = Op.getOperand(1);
7434 SDValue Offset = Op.getOperand(2);
7435 SDValue CachePolicy = Op.getOperand(3);
7436 EVT VT = Op.getValueType();
7437 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7438 SDLoc DL(Op);
7440 const DataLayout &DataLayout = DAG.getDataLayout();
7441 Align Alignment =
7447 VT.getStoreSize(), Alignment);
7448 SDValue LoadVal;
7449 if (!Offset->isDivergent()) {
7450 SDValue Ops[] = {Rsrc, // source register
7451 Offset, CachePolicy};
7452 SDValue BufferLoad =
7453 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7454 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7455 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7456 } else {
7457 SDValue Ops[] = {
7458 DAG.getEntryNode(), // Chain
7459 Rsrc, // rsrc
7460 DAG.getConstant(0, DL, MVT::i32), // vindex
7461 {}, // voffset
7462 {}, // soffset
7463 {}, // offset
7464 CachePolicy, // cachepolicy
7465 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7466 };
7467 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7468 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7469 }
7470 Results.push_back(LoadVal);
7471 return;
7472 }
7473 case Intrinsic::amdgcn_dead: {
7474 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7475 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7476 return;
7477 }
7478 }
7479 break;
7480 }
7481 case ISD::INTRINSIC_W_CHAIN: {
7482 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7483 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7484 // FIXME: Hacky
7485 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7486 Results.push_back(Res.getOperand(I));
7487 }
7488 } else {
7489 Results.push_back(Res);
7490 Results.push_back(Res.getValue(1));
7491 }
7492 return;
7493 }
7494
7495 break;
7496 }
7497 case ISD::SELECT: {
7498 SDLoc SL(N);
7499 EVT VT = N->getValueType(0);
7500 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7501 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7502 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7503
7504 EVT SelectVT = NewVT;
7505 if (NewVT.bitsLT(MVT::i32)) {
7506 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7507 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7508 SelectVT = MVT::i32;
7509 }
7510
7511 SDValue NewSelect =
7512 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7513
7514 if (NewVT != SelectVT)
7515 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7516 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7517 return;
7518 }
7519 case ISD::FNEG: {
7520 if (N->getValueType(0) != MVT::v2f16)
7521 break;
7522
7523 SDLoc SL(N);
7524 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7525
7526 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7527 DAG.getConstant(0x80008000, SL, MVT::i32));
7528 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7529 return;
7530 }
7531 case ISD::FABS: {
7532 if (N->getValueType(0) != MVT::v2f16)
7533 break;
7534
7535 SDLoc SL(N);
7536 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7537
7538 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7539 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7540 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7541 return;
7542 }
7543 case ISD::FSQRT: {
7544 if (N->getValueType(0) != MVT::f16)
7545 break;
7546 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7547 break;
7548 }
7549 default:
7550 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7551 break;
7552 }
7553}
7554
7555/// Helper function for LowerBRCOND
7556static SDNode *findUser(SDValue Value, unsigned Opcode) {
7557
7558 for (SDUse &U : Value->uses()) {
7559 if (U.get() != Value)
7560 continue;
7561
7562 if (U.getUser()->getOpcode() == Opcode)
7563 return U.getUser();
7564 }
7565 return nullptr;
7566}
7567
7568unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7569 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7570 switch (Intr->getConstantOperandVal(1)) {
7571 case Intrinsic::amdgcn_if:
7572 return AMDGPUISD::IF;
7573 case Intrinsic::amdgcn_else:
7574 return AMDGPUISD::ELSE;
7575 case Intrinsic::amdgcn_loop:
7576 return AMDGPUISD::LOOP;
7577 case Intrinsic::amdgcn_end_cf:
7578 llvm_unreachable("should not occur");
7579 default:
7580 return 0;
7581 }
7582 }
7583
7584 // break, if_break, else_break are all only used as inputs to loop, not
7585 // directly as branch conditions.
7586 return 0;
7587}
7588
7595
7597 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7598 return false;
7599
7600 // FIXME: Either avoid relying on address space here or change the default
7601 // address space for functions to avoid the explicit check.
7602 return (GV->getValueType()->isFunctionTy() ||
7605}
7606
7608 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7609}
7610
7612 if (!GV->hasExternalLinkage())
7613 return true;
7614
7615 const auto OS = getTargetMachine().getTargetTriple().getOS();
7616 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7617}
7618
7619/// This transforms the control flow intrinsics to get the branch destination as
7620/// the last parameter; it also switches the branch target with BR if needed.
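/// For example (illustrative), a branch on the result of llvm.amdgcn.if:
///   brcond (llvm.amdgcn.if %cond), %bb  ->  AMDGPUISD::IF %cond, %bb
/// i.e. the intrinsic is rebuilt as a control-flow node with the branch
/// destination appended as its last operand.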
7621SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7622 SDLoc DL(BRCOND);
7623
7624 SDNode *Intr = BRCOND.getOperand(1).getNode();
7625 SDValue Target = BRCOND.getOperand(2);
7626 SDNode *BR = nullptr;
7627 SDNode *SetCC = nullptr;
7628
7629 switch (Intr->getOpcode()) {
7630 case ISD::SETCC: {
7631 // As long as we negate the condition everything is fine
7632 SetCC = Intr;
7633 Intr = SetCC->getOperand(0).getNode();
7634 break;
7635 }
7636 case ISD::XOR: {
7637 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7638 SDValue LHS = Intr->getOperand(0);
7639 SDValue RHS = Intr->getOperand(1);
7640 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7641 Intr = LHS.getNode();
7642 break;
7643 }
7644 [[fallthrough]];
7645 }
7646 default: {
7647 // Get the target from BR if we don't negate the condition
7648 BR = findUser(BRCOND, ISD::BR);
7649 assert(BR && "brcond missing unconditional branch user");
7650 Target = BR->getOperand(1);
7651 }
7652 }
7653
7654 unsigned CFNode = isCFIntrinsic(Intr);
7655 if (CFNode == 0) {
7656 // This is a uniform branch so we don't need to legalize.
7657 return BRCOND;
7658 }
7659
7660 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7662
7663 assert(!SetCC ||
7664 (SetCC->getConstantOperandVal(1) == 1 &&
7665 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7666 ISD::SETNE));
7667
7668 // operands of the new intrinsic call
7670 if (HaveChain)
7671 Ops.push_back(BRCOND.getOperand(0));
7672
7673 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7674 Ops.push_back(Target);
7675
7676 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7677
7678 // build the new intrinsic call
7679 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7680
7681 if (!HaveChain) {
7682 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7683
7685 }
7686
7687 if (BR) {
7688 // Give the branch instruction our target
7689 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7690 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7691 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7692 }
7693
7694 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7695
7696 // Copy the intrinsic results to registers
7697 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7698 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7699 if (!CopyToReg)
7700 continue;
7701
7702 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7703 SDValue(Result, i - 1), SDValue());
7704
7705 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7706 }
7707
7708 // Remove the old intrinsic from the chain
7709 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7710 Intr->getOperand(0));
7711
7712 return Chain;
7713}
7714
7715SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7716 MVT VT = Op.getSimpleValueType();
7717 SDLoc DL(Op);
7718 // Checking the depth
7719 if (Op.getConstantOperandVal(0) != 0)
7720 return DAG.getConstant(0, DL, VT);
7721
7722 MachineFunction &MF = DAG.getMachineFunction();
7723 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7724 // Check for kernel and shader functions
7725 if (Info->isEntryFunction())
7726 return DAG.getConstant(0, DL, VT);
7727
7728 MachineFrameInfo &MFI = MF.getFrameInfo();
7729 // There is a call to @llvm.returnaddress in this function
7730 MFI.setReturnAddressIsTaken(true);
7731
7732 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7733 // Get the return address reg and mark it as an implicit live-in
7734 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7735 getRegClassFor(VT, Op.getNode()->isDivergent()));
7736
7737 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7738}
7739
7740SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7741 const SDLoc &DL, EVT VT) const {
7742 return Op.getValueType().bitsLE(VT)
7743 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7744 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7745 DAG.getTargetConstant(0, DL, MVT::i32));
7746}
7747
7748SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7749 SelectionDAG &DAG) const {
7750 EVT DstVT = Op.getValueType();
7751 unsigned NumElts = DstVT.getVectorNumElements();
7752 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7753
7754 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7755
7756 SDLoc DL(Op);
7757 unsigned Opc = Op.getOpcode();
7758 SDValue Flags = Op.getOperand(1);
7759 EVT HalfDstVT =
7760 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7761 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7762 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7763
7764 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7765}
7766
7767SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7768 SDValue Src = Op.getOperand(0);
7769 EVT SrcVT = Src.getValueType();
7770 EVT DstVT = Op.getValueType();
7771
7772 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7773 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7774 if (SrcVT.getScalarType() != MVT::f32)
7775 return SDValue();
7776 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7777 }
7778
7779 if (SrcVT.getScalarType() != MVT::f64)
7780 return Op;
7781
7782 SDLoc DL(Op);
7783 if (DstVT == MVT::f16) {
7784 // TODO: Handle strictfp
7785 if (Op.getOpcode() != ISD::FP_ROUND)
7786 return Op;
7787
7788 if (!Subtarget->has16BitInsts()) {
7789 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7790 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7791 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7792 }
7793 if (Op->getFlags().hasApproximateFuncs()) {
7794 SDValue Flags = Op.getOperand(1);
7795 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7796 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7797 }
7798 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7799 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7800 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7801 }
7802
7803 assert(DstVT.getScalarType() == MVT::bf16 &&
7804 "custom lower FP_ROUND for f16 or bf16");
7805 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7806
7807 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7808 // hardware f32 -> bf16 instruction.
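  // Rounding to odd in the first step preserves the sticky information that the
  // final, narrower rounding needs, so the two-step f64 -> f32 -> bf16 conversion
  // does not double-round to a different result than a single f64 -> bf16
  // rounding would produce.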
7809 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32)
7810 : MVT::f32;
7811 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7812 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7813 DAG.getTargetConstant(0, DL, MVT::i32));
7814}
7815
7816SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7817 SelectionDAG &DAG) const {
7818 EVT VT = Op.getValueType();
7819 const MachineFunction &MF = DAG.getMachineFunction();
7820 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7821 bool IsIEEEMode = Info->getMode().IEEE;
7822
7823 // FIXME: Assert during selection that this is only selected for
7824 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7825 // mode functions, but this happens to be OK since it's only done in cases
7826 // where there is known no sNaN.
7827 if (IsIEEEMode)
7828 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7829
7830 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7831 VT == MVT::v16bf16)
7832 return splitBinaryVectorOp(Op, DAG);
7833 return Op;
7834}
7835
7836SDValue
7837SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7838 SelectionDAG &DAG) const {
7839 EVT VT = Op.getValueType();
7840 const MachineFunction &MF = DAG.getMachineFunction();
7841 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7842 bool IsIEEEMode = Info->getMode().IEEE;
7843
7844 if (IsIEEEMode)
7845 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7846
7847 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7848 VT == MVT::v16bf16)
7849 return splitBinaryVectorOp(Op, DAG);
7850 return Op;
7851}
7852
7853SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7854 SelectionDAG &DAG) const {
7855 EVT VT = Op.getValueType();
7856 if (VT.isVector())
7857 return splitBinaryVectorOp(Op, DAG);
7858
7859 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7860 !Subtarget->hasMinimum3Maximum3F16() &&
7861 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7862 "should not need to widen f16 minimum/maximum to v2f16");
7863
7864 // Widen f16 operation to v2f16
7865
7866 // fminimum f16:x, f16:y ->
7867 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7868 // (v2f16 (scalar_to_vector y))), 0
7869 SDLoc SL(Op);
7870 SDValue WideSrc0 =
7871 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7872 SDValue WideSrc1 =
7873 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7874
7875 SDValue Widened =
7876 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7877
7878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7879 DAG.getConstant(0, SL, MVT::i32));
7880}
7881
7882SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7883 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7884 EVT VT = Op.getValueType();
7885 assert(VT == MVT::f16);
7886
7887 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7888 EVT ExpVT = Exp.getValueType();
7889 if (ExpVT == MVT::i16)
7890 return Op;
7891
7892 SDLoc DL(Op);
7893
7894 // Correct the exponent type for f16 to i16.
7895 // Clamp the range of the exponent to the instruction's range.
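  // For example (illustrative), with an i32 exponent:
  //   fldexp f16:x, i32:n
  //     -> fldexp f16:x, (trunc (smin (smax n, -32768), 32767) to i16)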
7896
7897 // TODO: This should be a generic narrowing legalization, and can easily be
7898 // done for GlobalISel as well.
7899
7900 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7901 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7902
7903 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7904 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7905
7906 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7907
7908 if (IsStrict) {
7909 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7910 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7911 }
7912
7913 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7914}
7915
7917 switch (Op->getOpcode()) {
7918 case ISD::SRA:
7919 case ISD::SMIN:
7920 case ISD::SMAX:
7921 return ISD::SIGN_EXTEND;
7922 case ISD::SRL:
7923 case ISD::UMIN:
7924 case ISD::UMAX:
7925 return ISD::ZERO_EXTEND;
7926 case ISD::ADD:
7927 case ISD::SUB:
7928 case ISD::AND:
7929 case ISD::OR:
7930 case ISD::XOR:
7931 case ISD::SHL:
7932 case ISD::SELECT:
7933 case ISD::MUL:
7934 // operation result won't be influenced by garbage high bits.
7935 // TODO: are all of those cases correct, and are there more?
7936 return ISD::ANY_EXTEND;
7937 case ISD::SETCC: {
7938 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7940 }
7941 default:
7942 llvm_unreachable("unexpected opcode!");
7943 }
7944}
7945
7946SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7947 DAGCombinerInfo &DCI) const {
7948 const unsigned Opc = Op.getOpcode();
7949 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7950 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7951 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7952 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7953 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7954
7955 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7956 : Op->getOperand(0).getValueType();
7957 auto ExtTy = OpTy.changeElementType(MVT::i32);
7958
7959 if (DCI.isBeforeLegalizeOps() ||
7960 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7961 return SDValue();
7962
7963 auto &DAG = DCI.DAG;
7964
7965 SDLoc DL(Op);
7966 SDValue LHS;
7967 SDValue RHS;
7968 if (Opc == ISD::SELECT) {
7969 LHS = Op->getOperand(1);
7970 RHS = Op->getOperand(2);
7971 } else {
7972 LHS = Op->getOperand(0);
7973 RHS = Op->getOperand(1);
7974 }
7975
7976 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7977 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7978
7979 // Special case: for shifts, the RHS always needs a zext.
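  // (an any-extended shift amount could carry garbage in the promoted high bits
  // and shift by more than intended)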
7980 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7981 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7982 else
7983 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7984
7985 // setcc always returns i1 or an i1 vector, so there is no need to truncate after.
7986 if (Opc == ISD::SETCC) {
7987 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7988 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7989 }
7990
7991 // For other ops, we extend the operation's return type as well so we need to
7992 // truncate back to the original type.
7993 SDValue NewVal;
7994 if (Opc == ISD::SELECT)
7995 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7996 else
7997 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7998
7999 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8000}
8001
8002SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8003 SDValue Mag = Op.getOperand(0);
8004 EVT MagVT = Mag.getValueType();
8005
8006 if (MagVT.getVectorNumElements() > 2)
8007 return splitBinaryVectorOp(Op, DAG);
8008
8009 SDValue Sign = Op.getOperand(1);
8010 EVT SignVT = Sign.getValueType();
8011
8012 if (MagVT == SignVT)
8013 return Op;
8014
8015 // fcopysign v2f16:mag, v2f32:sign ->
8016 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8017
8018 SDLoc SL(Op);
8019 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8020 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8021
8022 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8023
8024 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8025}
8026
8027// Custom lowering for vector multiplications and s_mul_u64.
8028SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8029 EVT VT = Op.getValueType();
8030
8031 // Split vector operands.
8032 if (VT.isVector())
8033 return splitBinaryVectorOp(Op, DAG);
8034
8035 assert(VT == MVT::i64 && "The following code is a special lowering for s_mul_u64");
8036
8037 // There are four ways to lower s_mul_u64:
8038 //
8039 // 1. If all the operands are uniform, then we lower it as it is.
8040 //
8041 // 2. If the operands are divergent, then we have to split s_mul_u64 into
8042 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
8043 //
8044 // 3. If the cost model decides that it is more efficient to use vector
8045 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
8046 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
8047 //
8048 // 4. If the cost model decides to use vector registers and both of the
8049 // operands are zero-extended/sign-extended from 32-bits, then we split the
8050 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8051 // possible to check if the operands are zero-extended or sign-extended in
8052 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8053 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8054 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8055 // If the cost model decides that we have to use vector registers, then
8056 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
8057 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
8058 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8059 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8060 // SIInstrInfo.cpp.
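  //
  // For example (illustrative), a uniform multiply of two values known to be
  // zero-extended from 32 bits:
  //   (mul i64:a, i64:b) -> S_MUL_U64_U32_PSEUDO a, b
  // because computeKnownBits() proves both operands have at least 32 leading
  // zeros (see below).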
8061
8062 if (Op->isDivergent())
8063 return SDValue();
8064
8065 SDValue Op0 = Op.getOperand(0);
8066 SDValue Op1 = Op.getOperand(1);
8067 // If all the operands are zero-extended from 32 bits, then we replace s_mul_u64
8068 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended from
8069 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8070 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8071 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8072 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8073 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8074 SDLoc SL(Op);
8075 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8076 return SDValue(
8077 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8078 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8079 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8080 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8081 return SDValue(
8082 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8083 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8084 return Op;
8085}
8086
8087SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8088 EVT VT = Op.getValueType();
8089 SDLoc SL(Op);
8090 SDValue LHS = Op.getOperand(0);
8091 SDValue RHS = Op.getOperand(1);
8092 bool isSigned = Op.getOpcode() == ISD::SMULO;
8093
8094 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8095 const APInt &C = RHSC->getAPIntValue();
8096 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8097 if (C.isPowerOf2()) {
8098 // smulo(x, signed_min) is same as umulo(x, signed_min).
8099 bool UseArithShift = isSigned && !C.isMinSignedValue();
8100 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8101 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8102 SDValue Overflow =
8103 DAG.getSetCC(SL, MVT::i1,
8104 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8105 Result, ShiftAmt),
8106 LHS, ISD::SETNE);
8107 return DAG.getMergeValues({Result, Overflow}, SL);
8108 }
8109 }
8110
8111 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8112 SDValue Top =
8113 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8114
8115 SDValue Sign = isSigned
8116 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8117 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8118 SL, MVT::i32))
8119 : DAG.getConstant(0, SL, VT);
8120 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8121
8122 return DAG.getMergeValues({Result, Overflow}, SL);
8123}
8124
8125SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8126 if (Op->isDivergent()) {
8127 // Select to V_MAD_[IU]64_[IU]32.
8128 return Op;
8129 }
8130 if (Subtarget->hasSMulHi()) {
8131 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8132 return SDValue();
8133 }
8134 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8135 // calculate the high part, so we might as well do the whole thing with
8136 // V_MAD_[IU]64_[IU]32.
8137 return Op;
8138}
8139
8140SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8141 if (!Subtarget->isTrapHandlerEnabled() ||
8142 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8143 return lowerTrapEndpgm(Op, DAG);
8144
8145 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8146 : lowerTrapHsaQueuePtr(Op, DAG);
8147}
8148
8149SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8150 SDLoc SL(Op);
8151 SDValue Chain = Op.getOperand(0);
8152 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8153}
8154
8155SDValue
8156SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8157 const SDLoc &DL, Align Alignment,
8158 ImplicitParameter Param) const {
8159 MachineFunction &MF = DAG.getMachineFunction();
8160 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8161 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8162 MachinePointerInfo PtrInfo =
8164 return DAG.getLoad(
8165 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8167}
8168
8169SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8170 SelectionDAG &DAG) const {
8171 SDLoc SL(Op);
8172 SDValue Chain = Op.getOperand(0);
8173
8174 SDValue QueuePtr;
8175 // For code object version 5, QueuePtr is passed through implicit kernarg.
8176 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8178 QueuePtr =
8179 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8180 } else {
8181 MachineFunction &MF = DAG.getMachineFunction();
8182 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8183 Register UserSGPR = Info->getQueuePtrUserSGPR();
8184
8185 if (UserSGPR == AMDGPU::NoRegister) {
8186 // We probably are in a function incorrectly marked with
8187 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8188 // trap, so just use a null pointer.
8189 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8190 } else {
8191 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8192 MVT::i64);
8193 }
8194 }
8195
8196 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8197 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8198
8199 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8200 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8201 ToReg.getValue(1)};
8202 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8203}
8204
8205SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8206 SDLoc SL(Op);
8207 SDValue Chain = Op.getOperand(0);
8208
8209 // We need to simulate the 's_trap 2' instruction on targets that run in
8210 // PRIV=1 (where it is treated as a nop).
8211 if (Subtarget->hasPrivEnabledTrap2NopBug())
8212 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8213
8214 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8215 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8216 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8217}
8218
8219SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8220 SDLoc SL(Op);
8221 SDValue Chain = Op.getOperand(0);
8222 MachineFunction &MF = DAG.getMachineFunction();
8223
8224 if (!Subtarget->isTrapHandlerEnabled() ||
8225 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8226 LLVMContext &Ctx = MF.getFunction().getContext();
8227 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8228 "debugtrap handler not supported",
8229 Op.getDebugLoc(), DS_Warning));
8230 return Chain;
8231 }
8232
8233 uint64_t TrapID =
8234 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8235 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8236 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8237}
8238
8239SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8240 SelectionDAG &DAG) const {
8241 if (Subtarget->hasApertureRegs()) {
8242 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8243 ? AMDGPU::SRC_SHARED_BASE
8244 : AMDGPU::SRC_PRIVATE_BASE;
8245 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8246 !Subtarget->hasGloballyAddressableScratch()) &&
8247 "Cannot use src_private_base with globally addressable scratch!");
8248 // Note: this feature (register) is broken. When used as a 32-bit operand,
8249 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8250 // bits.
8251 //
8252 // To work around the issue, emit a 64 bit copy from this register
8253 // then extract the high bits. Note that this shouldn't even result in a
8254 // shift being emitted and simply become a pair of registers (e.g.):
8255 // s_mov_b64 s[6:7], src_shared_base
8256 // v_mov_b32_e32 v1, s7
8257 SDValue Copy =
8258 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8259 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8260 }
8261
8262 // For code object version 5, private_base and shared_base are passed through
8263 // implicit kernargs.
8264 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8268 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8269 }
8270
8271 MachineFunction &MF = DAG.getMachineFunction();
8272 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8273 Register UserSGPR = Info->getQueuePtrUserSGPR();
8274 if (UserSGPR == AMDGPU::NoRegister) {
8275 // We probably are in a function incorrectly marked with
8276 // amdgpu-no-queue-ptr. This is undefined.
8277 return DAG.getPOISON(MVT::i32);
8278 }
8279
8280 SDValue QueuePtr =
8281 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8282
8283 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8284 // private_segment_aperture_base_hi.
8285 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8286
8287 SDValue Ptr =
8288 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8289
8290 // TODO: Use custom target PseudoSourceValue.
8291 // TODO: We should use the value from the IR intrinsic call, but it might not
8292 // be available and how do we get it?
8293 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8294 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8295 commonAlignment(Align(64), StructOffset),
8298}
8299
8300/// Return true if the value is a known valid address, such that a null check is
8301/// not necessary.
8302static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8303 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8305 return true;
8306
8307 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8308 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8309
8310 // TODO: Search through arithmetic, handle arguments and loads
8311 // marked nonnull.
8312 return false;
8313}
8314
8315SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8316 SelectionDAG &DAG) const {
8317 SDLoc SL(Op);
8318
8319 const AMDGPUTargetMachine &TM =
8320 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8321
8322 unsigned DestAS, SrcAS;
8323 SDValue Src;
8324 bool IsNonNull = false;
8325 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8326 SrcAS = ASC->getSrcAddressSpace();
8327 Src = ASC->getOperand(0);
8328 DestAS = ASC->getDestAddressSpace();
8329 } else {
8330 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8331 Op.getConstantOperandVal(0) ==
8332 Intrinsic::amdgcn_addrspacecast_nonnull);
8333 Src = Op->getOperand(1);
8334 SrcAS = Op->getConstantOperandVal(2);
8335 DestAS = Op->getConstantOperandVal(3);
8336 IsNonNull = true;
8337 }
8338
8339 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8340
8341 // flat -> local/private
8342 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8343 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8344 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8345 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8346
8347 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8348 Subtarget->hasGloballyAddressableScratch()) {
8349 // flat -> private with globally addressable scratch: subtract
8350 // src_flat_scratch_base_lo.
8351 SDValue FlatScratchBaseLo(
8352 DAG.getMachineNode(
8353 AMDGPU::S_MOV_B32, SL, MVT::i32,
8354 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8355 0);
8356 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8357 }
8358
8359 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8360 return Ptr;
8361
8362 unsigned NullVal = TM.getNullPointerValue(DestAS);
8363 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8364 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8365
8366 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8367 SegmentNullPtr);
8368 }
8369 }
8370
8371 // local/private -> flat
8372 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8373 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8374 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8375 SDValue CvtPtr;
8376 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8377 Subtarget->hasGloballyAddressableScratch()) {
8378 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8379 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8380 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8381 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8382 ThreadID = DAG.getNode(
8383 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8384 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8385 AllOnes, ThreadID);
8386 if (Subtarget->isWave64())
8387 ThreadID = DAG.getNode(
8388 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8389 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8390 AllOnes, ThreadID);
8391 SDValue ShAmt = DAG.getShiftAmountConstant(
8392 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8393 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8394 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8395 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8396 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8397 // 64-bit hi:lo value.
8398 SDValue FlatScratchBase = {
8399 DAG.getMachineNode(
8400 AMDGPU::S_MOV_B64, SL, MVT::i64,
8401 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8402 0};
8403 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8404 } else {
8405 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8406 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8407 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8408 }
8409
8410 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8411 return CvtPtr;
8412
8413 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8414 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8415
8416 SDValue NonNull =
8417 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8418
8419 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8420 FlatNullPtr);
8421 }
8422 }
8423
8424 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8425 Op.getValueType() == MVT::i64) {
8426 const SIMachineFunctionInfo *Info =
8427 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8428 if (Info->get32BitAddressHighBits() == 0)
8429 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
8430
8431 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8432 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8433 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8434 }
8435
8436 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8437 Src.getValueType() == MVT::i64)
8438 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8439
8440 // global <-> flat are no-ops and never emitted.
8441
8442 // Invalid casts are poison.
8443 return DAG.getPOISON(Op->getValueType(0));
8444}
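// A minimal worked example, assuming a target without globally addressable
// scratch: a private -> flat cast built by the code above becomes roughly
//
//   %cvt     = bitcast i64 (build_vector %src, %aperture)   ; %aperture from getSegmentAperture
//   %nonnull = setcc ne %src, getNullPointerValue(PRIVATE_ADDRESS)
//   %res     = select %nonnull, %cvt, 0                     ; 0 is the flat null pointer
//
// The amdgcn.addrspacecast.nonnull path sets IsNonNull and therefore skips the
// setcc/select pair entirely.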
8445
8446// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8447// the small vector and inserting them into the big vector. That is better than
8448// the default expansion of doing it via a stack slot. Even though the use of
8449// the stack slot would be optimized away afterwards, the stack slot itself
8450// remains.
8451SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8452 SelectionDAG &DAG) const {
8453 SDValue Vec = Op.getOperand(0);
8454 SDValue Ins = Op.getOperand(1);
8455 SDValue Idx = Op.getOperand(2);
8456 EVT VecVT = Vec.getValueType();
8457 EVT InsVT = Ins.getValueType();
8458 EVT EltVT = VecVT.getVectorElementType();
8459 unsigned InsNumElts = InsVT.getVectorNumElements();
8460 unsigned IdxVal = Idx->getAsZExtVal();
8461 SDLoc SL(Op);
8462
8463 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8464 // Insert 32-bit registers at a time.
8465 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8466
8467 unsigned VecNumElts = VecVT.getVectorNumElements();
8468 EVT NewVecVT =
8469 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8470 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8471 : EVT::getVectorVT(*DAG.getContext(),
8472 MVT::i32, InsNumElts / 2);
8473
8474 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8475 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8476
8477 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8478 SDValue Elt;
8479 if (InsNumElts == 2) {
8480 Elt = Ins;
8481 } else {
8482 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8483 DAG.getConstant(I, SL, MVT::i32));
8484 }
8485 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8486 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8487 }
8488
8489 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8490 }
8491
8492 for (unsigned I = 0; I != InsNumElts; ++I) {
8493 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8494 DAG.getConstant(I, SL, MVT::i32));
8495 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8496 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8497 }
8498 return Vec;
8499}
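// A minimal worked example: inserting a v2i16 subvector at index 2 of a v8i16
// vector takes the 32-bit path above and becomes a single 32-bit lane insert,
//
//   %vec32 = bitcast v8i16 %vec to v4i32
//   %ins32 = bitcast v2i16 %ins to i32
//   %res   = bitcast (insert_vector_elt %vec32, %ins32, 1) to v8i16
//
// while an odd insert index or a non-16-bit element type falls through to the
// per-element extract/insert loop at the end of the function.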
8500
8501SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8502 SelectionDAG &DAG) const {
8503 SDValue Vec = Op.getOperand(0);
8504 SDValue InsVal = Op.getOperand(1);
8505 SDValue Idx = Op.getOperand(2);
8506 EVT VecVT = Vec.getValueType();
8507 EVT EltVT = VecVT.getVectorElementType();
8508 unsigned VecSize = VecVT.getSizeInBits();
8509 unsigned EltSize = EltVT.getSizeInBits();
8510 SDLoc SL(Op);
8511
8512 // Specially handle the case of v4i16 with static indexing.
8513 unsigned NumElts = VecVT.getVectorNumElements();
8514 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8515 if (NumElts == 4 && EltSize == 16 && KIdx) {
8516 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8517
8518 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8519 DAG.getConstant(0, SL, MVT::i32));
8520 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8521 DAG.getConstant(1, SL, MVT::i32));
8522
8523 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8524 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8525
8526 unsigned Idx = KIdx->getZExtValue();
8527 bool InsertLo = Idx < 2;
8528 SDValue InsHalf = DAG.getNode(
8529 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8530 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8531 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8532
8533 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8534
8535 SDValue Concat =
8536 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8537 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8538
8539 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8540 }
8541
8542 // Static indexing does not lower to stack access, and hence there is no need
8543 // for special custom lowering to avoid stack access.
8544 if (isa<ConstantSDNode>(Idx))
8545 return SDValue();
8546
8547 // Avoid stack access for dynamic indexing by custom lowering to
8548 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8549
8550 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8551
8552 MVT IntVT = MVT::getIntegerVT(VecSize);
8553
8554 // Convert vector index to bit-index and get the required bit mask.
8555 assert(isPowerOf2_32(EltSize));
8556 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8557 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8558 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8559 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8560 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8561
8562 // 1. Create a congruent vector with the target value in each element.
8563 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8564 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8565
8566 // 2. Mask off all other indices except the required index within (1).
8567 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8568
8569 // 3. Mask off the required index within the target vector.
8570 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8571 SDValue RHS =
8572 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8573
8574 // 4. Get (2) and (3) ORed into the target vector.
8575 SDValue BFI =
8576 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8577
8578 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8579}
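// A minimal worked example for the dynamic-index path above, assuming a v4i16
// vector (VecSize == 64, EltSize == 16):
//
//   %scaledidx = shl %idx, 4                                ; element index -> bit index
//   %bfm       = shl i64 0xffff, %scaledidx                 ; mask selecting the target lane
//   %splat     = bitcast (build_vector %val,%val,%val,%val) to i64
//   %res       = bitcast ((%bfm & %splat) | (~%bfm & bitcast %vec)) to v4i16
//
// which matches the v_bfm_b32/v_bfi_b32 pattern mentioned in the comment and
// avoids going through a stack slot for the variable index.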
8580
8581SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8582 SelectionDAG &DAG) const {
8583 SDLoc SL(Op);
8584
8585 EVT ResultVT = Op.getValueType();
8586 SDValue Vec = Op.getOperand(0);
8587 SDValue Idx = Op.getOperand(1);
8588 EVT VecVT = Vec.getValueType();
8589 unsigned VecSize = VecVT.getSizeInBits();
8590 EVT EltVT = VecVT.getVectorElementType();
8591
8592 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8593
8594 // Make sure we do any optimizations that will make it easier to fold
8595 // source modifiers before obscuring it with bit operations.
8596
8597 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8598 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8599 return Combined;
8600
8601 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8602 SDValue Lo, Hi;
8603 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8604
8605 if (VecSize == 128) {
8606 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8607 Lo = DAG.getBitcast(LoVT,
8608 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8609 DAG.getConstant(0, SL, MVT::i32)));
8610 Hi = DAG.getBitcast(HiVT,
8611 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8612 DAG.getConstant(1, SL, MVT::i32)));
8613 } else if (VecSize == 256) {
8614 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8615 SDValue Parts[4];
8616 for (unsigned P = 0; P < 4; ++P) {
8617 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8618 DAG.getConstant(P, SL, MVT::i32));
8619 }
8620
8621 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8622 Parts[0], Parts[1]));
8623 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8624 Parts[2], Parts[3]));
8625 } else {
8626 assert(VecSize == 512);
8627
8628 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8629 SDValue Parts[8];
8630 for (unsigned P = 0; P < 8; ++P) {
8631 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8632 DAG.getConstant(P, SL, MVT::i32));
8633 }
8634
8635 Lo = DAG.getBitcast(LoVT,
8636 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8637 Parts[0], Parts[1], Parts[2], Parts[3]));
8638 Hi = DAG.getBitcast(HiVT,
8639 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8640 Parts[4], Parts[5], Parts[6], Parts[7]));
8641 }
8642
8643 EVT IdxVT = Idx.getValueType();
8644 unsigned NElem = VecVT.getVectorNumElements();
8645 assert(isPowerOf2_32(NElem));
8646 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8647 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8648 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8649 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8650 }
8651
8652 assert(VecSize <= 64);
8653
8654 MVT IntVT = MVT::getIntegerVT(VecSize);
8655
8656 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8657 SDValue VecBC = peekThroughBitcasts(Vec);
8658 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8659 SDValue Src = VecBC.getOperand(0);
8660 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8661 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8662 }
8663
8664 unsigned EltSize = EltVT.getSizeInBits();
8665 assert(isPowerOf2_32(EltSize));
8666
8667 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8668
8669 // Convert vector index to bit-index (* EltSize)
8670 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8671
8672 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8673 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8674
8675 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8676 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8677 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8678 }
8679
8680 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8681}
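// A minimal worked example: a dynamic extract from v4f16 %vec (VecSize == 64)
// reduces to a variable shift of the 64-bit bit pattern,
//
//   %scaledidx = shl %idx, 4
//   %elt       = srl (bitcast v4f16 %vec to i64), %scaledidx
//   %res       = bitcast (trunc %elt to i16) to f16
//
// For the 128/256/512-bit cases above, the vector is first split in half, the
// matching half is chosen with a setugt/select on the index, and the extract
// is then redone on that half with the index masked to NElem/2 - 1.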
8682
8683static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8684 assert(Elt % 2 == 0);
8685 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8686}
8687
8688static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8689 assert(Elt % 2 == 0);
8690 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8691 !(Mask[Elt + 1] & 1);
8692}
8693
8694SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8695 SelectionDAG &DAG) const {
8696 SDLoc SL(Op);
8697 EVT ResultVT = Op.getValueType();
8698 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8699 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8700 const int NewSrcNumElts = 2;
8701 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8702 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8703
8704 // Break up the shuffle into register-sized pieces.
8705 //
8706 // We're trying to form sub-shuffles that the register allocation pipeline
8707 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8708 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8709 // pair of copies into a consecutive register copy, so use the ordinary
8710 // extract_vector_elt lowering unless we can use the shuffle.
8711 //
8712 // TODO: This is a bit of a hack, and we should probably always use
8713 // extract_subvector for the largest possible subvector we can (or at least
8714 // use it for PackVT aligned pieces). However, we have worse support for
8715 // combines on them and don't directly treat extract_subvector / insert_subvector
8716 // as legal. The DAG scheduler also ends up doing a worse job with the
8717 // extract_subvectors.
8718 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8719
8720 // vector_shuffle <0,1,6,7> lhs, rhs
8721 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8722 //
8723 // vector_shuffle <6,7,2,3> lhs, rhs
8724 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8725 //
8726 // vector_shuffle <6,7,0,1> lhs, rhs
8727 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8728
8729 // Avoid scalarizing when both halves are reading from consecutive elements.
8730
8731 // If we're treating 2 element shuffles as legal, also create odd-to-even
8732 // shuffles of neighboring pairs.
8733 //
8734 // vector_shuffle <3,2,7,6> lhs, rhs
8735 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8736 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8737
8738 SmallVector<SDValue, 16> Pieces;
8739 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8740 if (ShouldUseConsecutiveExtract &&
8741 elementPairIsContiguous(SVN->getMask(), I)) {
8742 const int Idx = SVN->getMaskElt(I);
8743 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8744 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8745 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8746 SVN->getOperand(VecIdx),
8747 DAG.getConstant(EltIdx, SL, MVT::i32));
8748 Pieces.push_back(SubVec);
8749 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8750 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8751 int Idx0 = SVN->getMaskElt(I);
8752 int Idx1 = SVN->getMaskElt(I + 1);
8753
8754 SDValue SrcOp0 = SVN->getOperand(0);
8755 SDValue SrcOp1 = SrcOp0;
8756 if (Idx0 >= SrcNumElts) {
8757 SrcOp0 = SVN->getOperand(1);
8758 Idx0 -= SrcNumElts;
8759 }
8760
8761 if (Idx1 >= SrcNumElts) {
8762 SrcOp1 = SVN->getOperand(1);
8763 Idx1 -= SrcNumElts;
8764 }
8765
8766 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8767 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8768
8769 // Extract nearest even aligned piece.
8770 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8771 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8772 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8773 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8774
8775 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8776 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8777
8778 SDValue Result0 = SubVec0;
8779 SDValue Result1 = SubVec0;
8780
8781 if (SubVec0 != SubVec1) {
8782 NewMaskIdx1 += NewSrcNumElts;
8783 Result1 = SubVec1;
8784 } else {
8785 Result1 = DAG.getPOISON(PackVT);
8786 }
8787
8788 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8789 {NewMaskIdx0, NewMaskIdx1});
8790 Pieces.push_back(Shuf);
8791 } else {
8792 const int Idx0 = SVN->getMaskElt(I);
8793 const int Idx1 = SVN->getMaskElt(I + 1);
8794 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8795 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8796 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8797 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8798
8799 SDValue Vec0 = SVN->getOperand(VecIdx0);
8800 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8801 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8802
8803 SDValue Vec1 = SVN->getOperand(VecIdx1);
8804 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8805 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8806 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8807 }
8808 }
8809
8810 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8811}
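// A minimal worked example, assuming 2-element shuffles are treated as legal
// for PackVT: vector_shuffle <1,0,5,4> v4f16 %lhs, v4f16 %rhs has both pairs
// in odd-to-even form, so the loop above produces
//
//   concat_vectors (vector_shuffle<1,0> (extract_subvector %lhs, 0), poison),
//                  (vector_shuffle<1,0> (extract_subvector %rhs, 0), poison)
//
// Pairs that read consecutive, even-aligned elements take the plain
// extract_subvector path instead, and anything else is scalarized into
// per-element extracts rebuilt with build_vector.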
8812
8813SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8814 SelectionDAG &DAG) const {
8815 SDValue SVal = Op.getOperand(0);
8816 EVT ResultVT = Op.getValueType();
8817 EVT SValVT = SVal.getValueType();
8818 SDValue UndefVal = DAG.getPOISON(SValVT);
8819 SDLoc SL(Op);
8820
8821 SmallVector<SDValue, 8> VElts;
8822 VElts.push_back(SVal);
8823 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8824 VElts.push_back(UndefVal);
8825
8826 return DAG.getBuildVector(ResultVT, SL, VElts);
8827}
8828
8829SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8830 SelectionDAG &DAG) const {
8831 SDLoc SL(Op);
8832 EVT VT = Op.getValueType();
8833
8834 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8835 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8836
8837 SDValue Lo = Op.getOperand(0);
8838 SDValue Hi = Op.getOperand(1);
8839
8840 // Avoid adding defined bits with the zero_extend.
8841 if (Hi.isUndef()) {
8842 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8843 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8844 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8845 }
8846
8847 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8848 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8849
8850 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8851 DAG.getConstant(16, SL, MVT::i32));
8852 if (Lo.isUndef())
8853 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8854
8855 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8856 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8857
8858 SDValue Or =
8859 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8860 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8861 }
8862
8863 // Split into 2-element chunks.
8864 const unsigned NumParts = VT.getVectorNumElements() / 2;
8865 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8866 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8867
8868 SmallVector<SDValue, 8> Casts;
8869 for (unsigned P = 0; P < NumParts; ++P) {
8870 SDValue Vec = DAG.getBuildVector(
8871 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8872 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8873 }
8874
8875 SDValue Blend =
8876 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8877 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8878}
8879
8880 bool SITargetLowering::isOffsetFoldingLegal(
8881 const GlobalAddressSDNode *GA) const {
8882 // OSes that use ELF REL relocations (instead of RELA) can only store a
8883 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8884 // which can create arbitrary 64-bit addends. (This is only a problem for
8885 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8886 // the high 32 bits of the addend.)
8887 //
8888 // This should be kept in sync with how HasRelocationAddend is initialized in
8889 // the constructor of ELFAMDGPUAsmBackend.
8890 if (!Subtarget->isAmdHsaOS())
8891 return false;
8892
8893 // We can fold offsets for anything that doesn't require a GOT relocation.
8894 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8895 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8896 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8897 !shouldEmitGOTReloc(GA->getGlobal());
8898}
8899
8900static SDValue
8901 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8902 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8903 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8904 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8905 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8906 // lowered to the following code sequence:
8907 //
8908 // For constant address space:
8909 // s_getpc_b64 s[0:1]
8910 // s_add_u32 s0, s0, $symbol
8911 // s_addc_u32 s1, s1, 0
8912 //
8913 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8914 // a fixup or relocation is emitted to replace $symbol with a literal
8915 // constant, which is a pc-relative offset from the encoding of the $symbol
8916 // operand to the global variable.
8917 //
8918 // For global address space:
8919 // s_getpc_b64 s[0:1]
8920 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8921 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8922 //
8923 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8924 // fixups or relocations are emitted to replace $symbol@*@lo and
8925 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8926 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8927 // operand to the global variable.
8928 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8929 assert(GAFlags != SIInstrInfo::MO_NONE);
8930
8931 SDValue Ptr =
8932 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8933 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8934 }
8935
8936 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8937 SDValue PtrHi;
8938 if (GAFlags == SIInstrInfo::MO_NONE)
8939 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8940 else
8941 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8942 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8943}
8944
8945SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8946 SDValue Op,
8947 SelectionDAG &DAG) const {
8948 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8949 SDLoc DL(GSD);
8950 EVT PtrVT = Op.getValueType();
8951
8952 const GlobalValue *GV = GSD->getGlobal();
8953 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8954 shouldUseLDSConstAddress(GV)) ||
8955 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8956 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8957 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8958 GV->hasExternalLinkage()) {
8959 Type *Ty = GV->getValueType();
8960 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8961 // zero-sized type in other languages to declare dynamic shared memory
8962 // whose size is not known at compile time. It is allocated by the runtime
8963 // and placed directly after the statically allocated LDS, and all such
8964 // declarations share the same offset.
8965 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8966 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8967 // Adjust alignment for that dynamic shared memory array.
8968 Function &F = DAG.getMachineFunction().getFunction();
8969 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8970 MFI->setUsesDynamicLDS(true);
8971 return SDValue(
8972 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8973 }
8974 }
8975 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8976 }
8977
8978 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8979 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8980 SIInstrInfo::MO_ABS32_LO);
8981 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8982 }
8983
8984 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8985 if (Subtarget->has64BitLiterals()) {
8986 SDValue Addr = DAG.getTargetGlobalAddress(
8987 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8988 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8989 0);
8990 }
8991
8992 SDValue AddrLo = DAG.getTargetGlobalAddress(
8993 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8994 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8995
8996 SDValue AddrHi = DAG.getTargetGlobalAddress(
8997 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8998 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8999
9000 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9001 }
9002
9003 if (shouldEmitFixup(GV))
9004 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9005
9006 if (shouldEmitPCReloc(GV))
9007 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9008 SIInstrInfo::MO_REL32);
9009
9010 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9011 SIInstrInfo::MO_GOTPCREL32);
9012 PointerType *PtrTy =
9013 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
9014 const DataLayout &DataLayout = DAG.getDataLayout();
9015 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9016 MachinePointerInfo PtrInfo =
9017 MachinePointerInfo::getGOT(DAG.getMachineFunction());
9018
9019 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9020 MachineMemOperand::MODereferenceable |
9021 MachineMemOperand::MOInvariant);
9022}
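// A rough summary of the paths above on an amdhsa target: a global accepted by
// shouldEmitFixup() is materialized with a plain s_getpc/s_add pair; a global
// accepted by shouldEmitPCReloc() uses $symbol@rel32@lo / @rel32@hi fixups;
// everything else takes the GOT load path, which (assuming a 64-bit global
// pointer) typically ends up as
//
//   s_getpc_b64    s[0:1]
//   s_add_u32      s0, s0, g@gotpcrel32@lo+4
//   s_addc_u32     s1, s1, g@gotpcrel32@hi+12
//   s_load_dwordx2 s[0:1], s[0:1], 0x0   ; load the pointer from the GOT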
9023
9024 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9025 const SDLoc &DL, SDValue V) const {
9026 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9027 // the destination register.
9028 //
9029 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9030 // so we will end up with redundant moves to m0.
9031 //
9032 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9033
9034 // A Null SDValue creates a glue result.
9035 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9036 V, Chain);
9037 return SDValue(M0, 0);
9038}
9039
9040SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9041 MVT VT,
9042 unsigned Offset) const {
9043 SDLoc SL(Op);
9044 SDValue Param = lowerKernargMemParameter(
9045 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9046 // The local size values will have the high 16 bits as zero.
9047 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9048 DAG.getValueType(VT));
9049}
9050
9051 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9052 EVT VT) {
9053 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9054 DAG.getMachineFunction().getFunction(),
9055 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9056 return DAG.getPOISON(VT);
9057}
9058
9059 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9060 EVT VT) {
9061 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9062 DAG.getMachineFunction().getFunction(),
9063 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9064 return DAG.getPOISON(VT);
9065}
9066
9067 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9068 ArrayRef<SDValue> Elts) {
9069 assert(!Elts.empty());
9070 MVT Type;
9071 unsigned NumElts = Elts.size();
9072
9073 if (NumElts <= 12) {
9074 Type = MVT::getVectorVT(MVT::f32, NumElts);
9075 } else {
9076 assert(Elts.size() <= 16);
9077 Type = MVT::v16f32;
9078 NumElts = 16;
9079 }
9080
9081 SmallVector<SDValue, 16> VecElts(NumElts);
9082 for (unsigned i = 0; i < Elts.size(); ++i) {
9083 SDValue Elt = Elts[i];
9084 if (Elt.getValueType() != MVT::f32)
9085 Elt = DAG.getBitcast(MVT::f32, Elt);
9086 VecElts[i] = Elt;
9087 }
9088 for (unsigned i = Elts.size(); i < NumElts; ++i)
9089 VecElts[i] = DAG.getPOISON(MVT::f32);
9090
9091 if (NumElts == 1)
9092 return VecElts[0];
9093 return DAG.getBuildVector(Type, DL, VecElts);
9094}
9095
9096static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9097 SDValue Src, int ExtraElts) {
9098 EVT SrcVT = Src.getValueType();
9099
9100 SmallVector<SDValue, 8> Elts;
9101
9102 if (SrcVT.isVector())
9103 DAG.ExtractVectorElements(Src, Elts);
9104 else
9105 Elts.push_back(Src);
9106
9107 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9108 while (ExtraElts--)
9109 Elts.push_back(Undef);
9110
9111 return DAG.getBuildVector(CastVT, DL, Elts);
9112}
9113
9114 // Re-construct the required return value for an image load intrinsic.
9115 // This is more complicated due to the optional use of TexFailCtrl, which means
9116 // the required return type is an aggregate.
9117 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9118 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9119 bool Unpacked, bool IsD16, int DMaskPop,
9120 int NumVDataDwords, bool IsAtomicPacked16Bit,
9121 const SDLoc &DL) {
9122 // Determine the required return type. This is the same regardless of
9123 // IsTexFail flag
9124 EVT ReqRetVT = ResultTypes[0];
9125 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9126 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9127 ? (ReqRetNumElts + 1) / 2
9128 : ReqRetNumElts;
9129
9130 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9131
9132 MVT DataDwordVT =
9133 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9134
9135 MVT MaskPopVT =
9136 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9137
9138 SDValue Data(Result, 0);
9139 SDValue TexFail;
9140
9141 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9142 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9143 if (MaskPopVT.isVector()) {
9144 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9145 SDValue(Result, 0), ZeroIdx);
9146 } else {
9147 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9148 SDValue(Result, 0), ZeroIdx);
9149 }
9150 }
9151
9152 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9153 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9154 NumDataDwords - MaskPopDwords);
9155
9156 if (IsD16)
9157 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9158
9159 EVT LegalReqRetVT = ReqRetVT;
9160 if (!ReqRetVT.isVector()) {
9161 if (!Data.getValueType().isInteger())
9162 Data = DAG.getNode(ISD::BITCAST, DL,
9163 Data.getValueType().changeTypeToInteger(), Data);
9164 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9165 } else {
9166 // We need to widen the return vector to a legal type
9167 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9168 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9169 LegalReqRetVT =
9170 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9171 ReqRetVT.getVectorNumElements() + 1);
9172 }
9173 }
9174 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9175
9176 if (IsTexFail) {
9177 TexFail =
9178 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9179 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9180
9181 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9182 }
9183
9184 if (Result->getNumValues() == 1)
9185 return Data;
9186
9187 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9188}
9189
9190static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9191 SDValue *LWE, bool &IsTexFail) {
9192 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9193
9194 uint64_t Value = TexFailCtrlConst->getZExtValue();
9195 if (Value) {
9196 IsTexFail = true;
9197 }
9198
9199 SDLoc DL(TexFailCtrlConst);
9200 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9201 Value &= ~(uint64_t)0x1;
9202 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9203 Value &= ~(uint64_t)0x2;
9204
9205 return Value == 0;
9206}
9207
9208 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9209 MVT PackVectorVT,
9210 SmallVectorImpl<SDValue> &PackedAddrs,
9211 unsigned DimIdx, unsigned EndIdx,
9212 unsigned NumGradients) {
9213 SDLoc DL(Op);
9214 for (unsigned I = DimIdx; I < EndIdx; I++) {
9215 SDValue Addr = Op.getOperand(I);
9216
9217 // Gradients are packed with undef for each coordinate.
9218 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9219 // 1D: undef,dx/dh; undef,dx/dv
9220 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9221 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9222 if (((I + 1) >= EndIdx) ||
9223 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9224 I == DimIdx + NumGradients - 1))) {
9225 if (Addr.getValueType() != MVT::i16)
9226 Addr = DAG.getBitcast(MVT::i16, Addr);
9227 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9228 } else {
9229 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9230 I++;
9231 }
9232 Addr = DAG.getBitcast(MVT::f32, Addr);
9233 PackedAddrs.push_back(Addr);
9234 }
9235}
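// A minimal worked example: a 2D gradient set (dx/dh, dy/dh, dx/dv, dy/dv) in
// f16 is packed by the loop above into two dwords,
//   v2f16 (dx/dh, dy/dh) and v2f16 (dx/dv, dy/dv),
// each bitcast to f32 before being appended, while the odd leftover components
// of the 1D and 3D cases are any-extended with an undefined high half, as laid
// out in the <hi>,<lo> table in the comment.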
9236
9237SDValue SITargetLowering::lowerImage(SDValue Op,
9238 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9239 SelectionDAG &DAG, bool WithChain) const {
9240 SDLoc DL(Op);
9241 MachineFunction &MF = DAG.getMachineFunction();
9242 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9243 unsigned IntrOpcode = Intr->BaseOpcode;
9244 // For image atomic: use no-return opcode if result is unused.
9245 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9246 !Op.getNode()->hasAnyUseOfValue(0))
9247 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9248 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9249 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
9250 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9251 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9252 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9253 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9254
9255 SmallVector<EVT, 3> ResultTypes(Op->values());
9256 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9257 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9258 ResultTypes.erase(&ResultTypes[0]);
9259
9260 bool IsD16 = false;
9261 bool IsG16 = false;
9262 bool IsA16 = false;
9263 SDValue VData;
9264 int NumVDataDwords = 0;
9265 bool AdjustRetType = false;
9266 bool IsAtomicPacked16Bit = false;
9267
9268 // Offset of intrinsic arguments
9269 const unsigned ArgOffset = WithChain ? 2 : 1;
9270
9271 unsigned DMask;
9272 unsigned DMaskLanes = 0;
9273
9274 if (BaseOpcode->Atomic) {
9275 VData = Op.getOperand(2);
9276
9277 IsAtomicPacked16Bit =
9278 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9279 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9280 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9281 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9282
9283 bool Is64Bit = VData.getValueSizeInBits() == 64;
9284 if (BaseOpcode->AtomicX2) {
9285 SDValue VData2 = Op.getOperand(3);
9286 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9287 {VData, VData2});
9288 if (Is64Bit)
9289 VData = DAG.getBitcast(MVT::v4i32, VData);
9290
9291 if (!BaseOpcode->NoReturn)
9292 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9293
9294 DMask = Is64Bit ? 0xf : 0x3;
9295 NumVDataDwords = Is64Bit ? 4 : 2;
9296 } else {
9297 DMask = Is64Bit ? 0x3 : 0x1;
9298 NumVDataDwords = Is64Bit ? 2 : 1;
9299 }
9300 } else {
9301 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9302 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9303
9304 if (BaseOpcode->Store) {
9305 VData = Op.getOperand(2);
9306
9307 MVT StoreVT = VData.getSimpleValueType();
9308 if (StoreVT.getScalarType() == MVT::f16) {
9309 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9310 return Op; // D16 is unsupported for this instruction
9311
9312 IsD16 = true;
9313 VData = handleD16VData(VData, DAG, true);
9314 }
9315
9316 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9317 } else if (!BaseOpcode->NoReturn) {
9318 // Work out the number of dwords based on the dmask popcount, the underlying
9319 // type, and whether packing is supported.
9320 MVT LoadVT = ResultTypes[0].getSimpleVT();
9321 if (LoadVT.getScalarType() == MVT::f16) {
9322 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9323 return Op; // D16 is unsupported for this instruction
9324
9325 IsD16 = true;
9326 }
9327
9328 // Confirm that the return type is large enough for the dmask specified
9329 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9330 (!LoadVT.isVector() && DMaskLanes > 1))
9331 return Op;
9332
9333 // The sq block of gfx8 and gfx9 does not estimate register use correctly
9334 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9335 // instructions.
9336 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9337 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9338 NumVDataDwords = (DMaskLanes + 1) / 2;
9339 else
9340 NumVDataDwords = DMaskLanes;
9341
9342 AdjustRetType = true;
9343 }
9344 }
9345
9346 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9347 SmallVector<SDValue, 4> VAddrs;
9348
9349 // Check for 16 bit addresses or derivatives and pack if true.
9350 MVT VAddrVT =
9351 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9352 MVT VAddrScalarVT = VAddrVT.getScalarType();
9353 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9354 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9355
9356 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9357 VAddrScalarVT = VAddrVT.getScalarType();
9358 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9359 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9360
9361 // Push back extra arguments.
9362 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9363 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9364 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9365 // Special handling of bias when A16 is on. Bias is of type half but
9366 // occupies full 32-bit.
9367 SDValue Bias = DAG.getBuildVector(
9368 MVT::v2f16, DL,
9369 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9370 VAddrs.push_back(Bias);
9371 } else {
9372 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9373 "Bias needs to be converted to 16 bit in A16 mode");
9374 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9375 }
9376 }
9377
9378 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9379 // 16 bit gradients are supported, but are tied to the A16 control
9380 // so both gradients and addresses must be 16 bit
9381 LLVM_DEBUG(
9382 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9383 "require 16 bit args for both gradients and addresses");
9384 return Op;
9385 }
9386
9387 if (IsA16) {
9388 if (!ST->hasA16()) {
9389 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9390 "support 16 bit addresses\n");
9391 return Op;
9392 }
9393 }
9394
9395 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9396 // set then we have to compress/pack operands (either address, gradient, or
9397 // both).
9398 // In the case where A16 and gradients are tied (no G16 support) we have
9399 // already verified that both IsA16 and IsG16 are true.
9400 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9401 // Activate g16
9402 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9403 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9404 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9405 }
9406
9407 // Add gradients (packed or unpacked)
9408 if (IsG16) {
9409 // Pack the gradients
9410 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9411 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9412 ArgOffset + Intr->GradientStart,
9413 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9414 } else {
9415 for (unsigned I = ArgOffset + Intr->GradientStart;
9416 I < ArgOffset + Intr->CoordStart; I++)
9417 VAddrs.push_back(Op.getOperand(I));
9418 }
9419
9420 // Add addresses (packed or unpacked)
9421 if (IsA16) {
9422 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9423 ArgOffset + Intr->CoordStart, VAddrEnd,
9424 0 /* No gradients */);
9425 } else {
9426 // Add uncompressed address
9427 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9428 VAddrs.push_back(Op.getOperand(I));
9429 }
9430
9431 // If the register allocator cannot place the address registers contiguously
9432 // without introducing moves, then using the non-sequential address encoding
9433 // is always preferable, since it saves VALU instructions and is usually a
9434 // wash in terms of code size or even better.
9435 //
9436 // However, we currently have no way of hinting to the register allocator that
9437 // MIMG addresses should be placed contiguously when it is possible to do so,
9438 // so force non-NSA for the common 2-address case as a heuristic.
9439 //
9440 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9441 // allocation when possible.
9442 //
9443 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9444 // set of the remaining addresses.
9445 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9446 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9447 const bool UseNSA = ST->hasNSAEncoding() &&
9448 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9449 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9450 const bool UsePartialNSA =
9451 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9452
9453 SDValue VAddr;
9454 if (UsePartialNSA) {
9455 VAddr = getBuildDwordsVector(DAG, DL,
9456 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9457 } else if (!UseNSA) {
9458 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9459 }
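// A worked example of the heuristic above, assuming an NSA limit of 5 with
// partial-NSA support: a sample with 7 address dwords keeps its first 4
// registers in NSA form and packs the remaining 3 into one contiguous tuple
// via getBuildDwordsVector (the UsePartialNSA case), whereas a simple
// 2-address load stays under getNSAThreshold and uses the single contiguous
// VAddr built in the !UseNSA branch.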
9460
9461 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9462 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9463 SDValue Unorm;
9464 if (!BaseOpcode->Sampler) {
9465 Unorm = True;
9466 } else {
9467 uint64_t UnormConst =
9468 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9469
9470 Unorm = UnormConst ? True : False;
9471 }
9472
9473 SDValue TFE;
9474 SDValue LWE;
9475 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9476 bool IsTexFail = false;
9477 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9478 return Op;
9479
9480 if (IsTexFail) {
9481 if (!DMaskLanes) {
9482 // Expecting to get an error flag since TFC is on and dmask is 0.
9483 // Force dmask to be at least 1, otherwise the instruction will fail.
9484 DMask = 0x1;
9485 DMaskLanes = 1;
9486 NumVDataDwords = 1;
9487 }
9488 NumVDataDwords += 1;
9489 AdjustRetType = true;
9490 }
9491
9492 // Something earlier tagged that the return type needs adjusting.
9493 // This happens if the instruction is a load or has set TexFailCtrl flags.
9494 if (AdjustRetType) {
9495 // NumVDataDwords reflects the true number of dwords required in the return
9496 // type
9497 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9498 // This is a no-op load. This can be eliminated
9499 SDValue Undef = DAG.getPOISON(Op.getValueType());
9500 if (isa<MemSDNode>(Op))
9501 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9502 return Undef;
9503 }
9504
9505 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9506 MVT::i32, NumVDataDwords)
9507 : MVT::i32;
9508
9509 ResultTypes[0] = NewVT;
9510 if (ResultTypes.size() == 3) {
9511 // The original result was an aggregate type used for TexFailCtrl results.
9512 // The actual instruction returns as a vector type, which has now been
9513 // created. Remove the aggregate result.
9514 ResultTypes.erase(&ResultTypes[1]);
9515 }
9516 }
9517
9518 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9519 // Keep GLC only when the atomic's result is actually used.
9520 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9521 CPol |= AMDGPU::CPol::GLC;
9522 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9523 AMDGPU::CPol::VOLATILE))
9524 return Op;
9525
9526 SmallVector<SDValue, 26> Ops;
9527 if (BaseOpcode->Store || BaseOpcode->Atomic)
9528 Ops.push_back(VData); // vdata
9529 if (UsePartialNSA) {
9530 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9531 Ops.push_back(VAddr);
9532 } else if (UseNSA)
9533 append_range(Ops, VAddrs);
9534 else
9535 Ops.push_back(VAddr);
9536 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9537 EVT RsrcVT = Rsrc.getValueType();
9538 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9539 return Op;
9540 Ops.push_back(Rsrc);
9541 if (BaseOpcode->Sampler) {
9542 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9543 if (Samp.getValueType() != MVT::v4i32)
9544 return Op;
9545 Ops.push_back(Samp);
9546 }
9547 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9548 if (IsGFX10Plus)
9549 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9550 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9551 Ops.push_back(Unorm);
9552 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9553 Ops.push_back(IsA16 && // r128, a16 for gfx9
9554 ST->hasFeature(AMDGPU::FeatureR128A16)
9555 ? True
9556 : False);
9557 if (IsGFX10Plus)
9558 Ops.push_back(IsA16 ? True : False);
9559
9560 if (!Subtarget->hasGFX90AInsts())
9561 Ops.push_back(TFE); // tfe
9562 else if (TFE->getAsZExtVal()) {
9563 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9565 "TFE is not supported on this GPU", DL.getDebugLoc()));
9566 }
9567
9568 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9569 Ops.push_back(LWE); // lwe
9570 if (!IsGFX10Plus)
9571 Ops.push_back(DimInfo->DA ? True : False);
9572 if (BaseOpcode->HasD16)
9573 Ops.push_back(IsD16 ? True : False);
9574 if (isa<MemSDNode>(Op))
9575 Ops.push_back(Op.getOperand(0)); // chain
9576
9577 int NumVAddrDwords =
9578 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9579 int Opcode = -1;
9580
9581 if (IsGFX12Plus) {
9582 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9583 NumVDataDwords, NumVAddrDwords);
9584 } else if (IsGFX11Plus) {
9585 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9586 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9587 : AMDGPU::MIMGEncGfx11Default,
9588 NumVDataDwords, NumVAddrDwords);
9589 } else if (IsGFX10Plus) {
9590 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9591 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9592 : AMDGPU::MIMGEncGfx10Default,
9593 NumVDataDwords, NumVAddrDwords);
9594 } else {
9595 if (Subtarget->hasGFX90AInsts()) {
9596 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9597 NumVDataDwords, NumVAddrDwords);
9598 if (Opcode == -1) {
9599 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9601 "requested image instruction is not supported on this GPU",
9602 DL.getDebugLoc()));
9603
9604 unsigned Idx = 0;
9605 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9606 for (EVT VT : OrigResultTypes) {
9607 if (VT == MVT::Other)
9608 RetValues[Idx++] = Op.getOperand(0); // Chain
9609 else
9610 RetValues[Idx++] = DAG.getPOISON(VT);
9611 }
9612
9613 return DAG.getMergeValues(RetValues, DL);
9614 }
9615 }
9616 if (Opcode == -1 &&
9617 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9618 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9619 NumVDataDwords, NumVAddrDwords);
9620 if (Opcode == -1)
9621 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9622 NumVDataDwords, NumVAddrDwords);
9623 }
9624 if (Opcode == -1)
9625 return Op;
9626
9627 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9628 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9629 MachineMemOperand *MemRef = MemOp->getMemOperand();
9630 DAG.setNodeMemRefs(NewNode, {MemRef});
9631 }
9632
9633 if (BaseOpcode->NoReturn) {
9634 if (BaseOpcode->Atomic)
9635 return DAG.getMergeValues(
9636 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9637
9638 return SDValue(NewNode, 0);
9639 }
9640
9641 if (BaseOpcode->AtomicX2) {
9642 SmallVector<SDValue, 1> Elt;
9643 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9644 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9645 }
9646
9647 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9648 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9649 NumVDataDwords, IsAtomicPacked16Bit, DL);
9650}
9651
9652SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9653 SDValue Offset, SDValue CachePolicy,
9654 SelectionDAG &DAG) const {
9655 MachineFunction &MF = DAG.getMachineFunction();
9656
9657 const DataLayout &DataLayout = DAG.getDataLayout();
9658 Align Alignment =
9659 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9660
9661 MachineMemOperand *MMO = MF.getMachineMemOperand(
9662 MachinePointerInfo(),
9663 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9664 MachineMemOperand::MOInvariant,
9665 VT.getStoreSize(), Alignment);
9666
9667 if (!Offset->isDivergent()) {
9668 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9669
9670 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9671 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9672 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
9673 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9674 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9675 SDValue BufferLoad =
9676 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9677 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9678 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9679 }
9680
9681 // Widen vec3 load to vec4.
9682 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9683 !Subtarget->hasScalarDwordx3Loads()) {
9684 EVT WidenedVT =
9685 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9686 auto WidenedOp = DAG.getMemIntrinsicNode(
9687 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9688 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9689 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9690 DAG.getVectorIdxConstant(0, DL));
9691 return Subvector;
9692 }
9693
9694 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9695 DAG.getVTList(VT), Ops, VT, MMO);
9696 }
9697
9698 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9699 // assume that the buffer is unswizzled.
9700 SDValue Ops[] = {
9701 DAG.getEntryNode(), // Chain
9702 Rsrc, // rsrc
9703 DAG.getConstant(0, DL, MVT::i32), // vindex
9704 {}, // voffset
9705 {}, // soffset
9706 {}, // offset
9707 CachePolicy, // cachepolicy
9708 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9709 };
9710 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9711 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9712 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9713 }
9714
9715 SmallVector<SDValue, 4> Loads;
9716 unsigned NumLoads = 1;
9717 MVT LoadVT = VT.getSimpleVT();
9718 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9719 assert((LoadVT.getScalarType() == MVT::i32 ||
9720 LoadVT.getScalarType() == MVT::f32));
9721
9722 if (NumElts == 8 || NumElts == 16) {
9723 NumLoads = NumElts / 4;
9724 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9725 }
9726
9727 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9728
9729 // Use the alignment to ensure that the required offsets will fit into the
9730 // immediate offsets.
9731 setBufferOffsets(Offset, DAG, &Ops[3],
9732 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9733
9734 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9735 for (unsigned i = 0; i < NumLoads; ++i) {
9736 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9737 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9738 LoadVT, MMO, DAG));
9739 }
9740
9741 if (NumElts == 8 || NumElts == 16)
9742 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9743
9744 return Loads[0];
9745}
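// A minimal worked example: an s.buffer.load of v8f32 with a divergent offset
// is split by the loop above into two BUFFER_LOAD pieces of v4f32 at immediate
// offsets +0 and +16 (setBufferOffsets is given Align(16 * NumLoads) so both
// fit the immediate field), and the pieces are rejoined with concat_vectors.
// A uniform offset instead stays a single SBUFFER_LOAD, with vec3 widened to
// vec4 when scalar dwordx3 loads are unavailable.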
9746
9747SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9748 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9749 if (!Subtarget->hasArchitectedSGPRs())
9750 return {};
9751 SDLoc SL(Op);
9752 MVT VT = MVT::i32;
9753 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9754 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9755 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9756}
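// A minimal sketch of what the BFE_U32 above corresponds to at the ISA level:
//
//   s_bfe_u32 dst, ttmp8, (5 << 16) | 25   ; width 5, offset 25 -> TTMP8[29:25]
//
// i.e. the wave's index within its workgroup, which is only meaningful when
// architected SGPRs are present (hence the early bail-out).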
9757
9758SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9759 AMDGPU::Hwreg::Id HwReg,
9760 unsigned LowBit,
9761 unsigned Width) const {
9762 SDLoc SL(Op);
9763 using namespace AMDGPU::Hwreg;
9764 return {DAG.getMachineNode(
9765 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9766 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9767 SL, MVT::i32)),
9768 0};
9769}
9770
9771SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9772 unsigned Dim,
9773 const ArgDescriptor &Arg) const {
9774 SDLoc SL(Op);
9775 MachineFunction &MF = DAG.getMachineFunction();
9776 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9777 if (MaxID == 0)
9778 return DAG.getConstant(0, SL, MVT::i32);
9779
9780 // It's undefined behavior if a function marked with the amdgpu-no-*
9781 // attributes uses the corresponding intrinsic.
9782 if (!Arg)
9783 return DAG.getPOISON(Op->getValueType(0));
9784
9785 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9786 SDLoc(DAG.getEntryNode()), Arg);
9787
9788 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9789 // masking operations anyway.
9790 //
9791 // TODO: We could assert the top bit is 0 for the source copy.
9792 if (Arg.isMasked())
9793 return Val;
9794
9795 // Preserve the known bits after expansion to a copy.
9796 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9797 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9798 DAG.getValueType(SmallVT));
9799}
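// A minimal worked example: with a known work-group size of 64x1x1, MaxID for
// dimension 0 becomes 63, so the loaded VGPR is wrapped as
//   AssertZext %val, i6        ; llvm::bit_width(63) == 6
// telling later combines that only the low 6 bits may be set. A dimension with
// MaxID == 0 folds to constant 0, and a function carrying the matching
// amdgpu-no-workitem-id attribute lowers to poison instead.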
9800
9801SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9802 SelectionDAG &DAG) const {
9803 MachineFunction &MF = DAG.getMachineFunction();
9804 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9805
9806 EVT VT = Op.getValueType();
9807 SDLoc DL(Op);
9808 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9809
9810 // TODO: Should this propagate fast-math-flags?
9811
9812 switch (IntrinsicID) {
9813 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9814 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9815 return emitNonHSAIntrinsicError(DAG, DL, VT);
9816 return getPreloadedValue(DAG, *MFI, VT,
9817 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9818 }
9819 case Intrinsic::amdgcn_dispatch_ptr:
9820 case Intrinsic::amdgcn_queue_ptr: {
9821 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9822 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9823 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9824 DL.getDebugLoc()));
9825 return DAG.getPOISON(VT);
9826 }
9827
9828 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9829 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9830 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9831 return getPreloadedValue(DAG, *MFI, VT, RegID);
9832 }
9833 case Intrinsic::amdgcn_implicitarg_ptr: {
9834 if (MFI->isEntryFunction())
9835 return getImplicitArgPtr(DAG, DL);
9836 return getPreloadedValue(DAG, *MFI, VT,
9837 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9838 }
9839 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9840 if (!AMDGPU::isKernel(MF.getFunction())) {
9841 // This only makes sense to call in a kernel, so just lower to null.
9842 return DAG.getConstant(0, DL, VT);
9843 }
9844
9845 return getPreloadedValue(DAG, *MFI, VT,
9846 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9847 }
9848 case Intrinsic::amdgcn_dispatch_id: {
9849 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9850 }
9851 case Intrinsic::amdgcn_rcp:
9852 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9853 case Intrinsic::amdgcn_rsq:
9854 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9855 case Intrinsic::amdgcn_rsq_legacy:
9856 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9857 return emitRemovedIntrinsicError(DAG, DL, VT);
9858 return SDValue();
9859 case Intrinsic::amdgcn_rcp_legacy:
9860 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9861 return emitRemovedIntrinsicError(DAG, DL, VT);
9862 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9863 case Intrinsic::amdgcn_rsq_clamp: {
9864 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9865 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9866
9867 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9868 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9869 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9870
9871 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9872 SDValue Tmp =
9873 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9874 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9875 DAG.getConstantFP(Min, DL, VT));
9876 }
9877 case Intrinsic::r600_read_ngroups_x:
9878 if (Subtarget->isAmdHsaOS())
9879 return emitNonHSAIntrinsicError(DAG, DL, VT);
9880
9881 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9883 false);
9884 case Intrinsic::r600_read_ngroups_y:
9885 if (Subtarget->isAmdHsaOS())
9886 return emitNonHSAIntrinsicError(DAG, DL, VT);
9887
9888 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9890 false);
9891 case Intrinsic::r600_read_ngroups_z:
9892 if (Subtarget->isAmdHsaOS())
9893 return emitNonHSAIntrinsicError(DAG, DL, VT);
9894
9895 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9897 false);
9898 case Intrinsic::r600_read_local_size_x:
9899 if (Subtarget->isAmdHsaOS())
9900 return emitNonHSAIntrinsicError(DAG, DL, VT);
9901
9902 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9904 case Intrinsic::r600_read_local_size_y:
9905 if (Subtarget->isAmdHsaOS())
9906 return emitNonHSAIntrinsicError(DAG, DL, VT);
9907
9908 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9910 case Intrinsic::r600_read_local_size_z:
9911 if (Subtarget->isAmdHsaOS())
9912 return emitNonHSAIntrinsicError(DAG, DL, VT);
9913
9914 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9916 case Intrinsic::amdgcn_workgroup_id_x:
9917 return lowerWorkGroupId(DAG, *MFI, VT,
9921 case Intrinsic::amdgcn_workgroup_id_y:
9922 return lowerWorkGroupId(DAG, *MFI, VT,
9926 case Intrinsic::amdgcn_workgroup_id_z:
9927 return lowerWorkGroupId(DAG, *MFI, VT,
9931 case Intrinsic::amdgcn_cluster_id_x:
9932 return Subtarget->hasClusters()
9933 ? getPreloadedValue(DAG, *MFI, VT,
9935 : DAG.getPOISON(VT);
9936 case Intrinsic::amdgcn_cluster_id_y:
9937 return Subtarget->hasClusters()
9938 ? getPreloadedValue(DAG, *MFI, VT,
9940 : DAG.getPOISON(VT);
9941 case Intrinsic::amdgcn_cluster_id_z:
9942 return Subtarget->hasClusters()
9943 ? getPreloadedValue(DAG, *MFI, VT,
9945 : DAG.getPOISON(VT);
9946 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9947 return Subtarget->hasClusters()
9948 ? getPreloadedValue(
9949 DAG, *MFI, VT,
9951 : DAG.getPOISON(VT);
9952 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9953 return Subtarget->hasClusters()
9954 ? getPreloadedValue(
9955 DAG, *MFI, VT,
9957 : DAG.getPOISON(VT);
9958 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9959 return Subtarget->hasClusters()
9960 ? getPreloadedValue(
9961 DAG, *MFI, VT,
9963 : DAG.getPOISON(VT);
9964 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9965 return Subtarget->hasClusters()
9966 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9967 : SDValue();
9968 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9969 return Subtarget->hasClusters()
9970 ? getPreloadedValue(
9971 DAG, *MFI, VT,
9973 : DAG.getPOISON(VT);
9974 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9975 return Subtarget->hasClusters()
9976 ? getPreloadedValue(
9977 DAG, *MFI, VT,
9979 : DAG.getPOISON(VT);
9980 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9981 return Subtarget->hasClusters()
9982 ? getPreloadedValue(
9983 DAG, *MFI, VT,
9985 : DAG.getPOISON(VT);
9986 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9987 return Subtarget->hasClusters()
9988 ? getPreloadedValue(
9989 DAG, *MFI, VT,
9991 : DAG.getPOISON(VT);
9992 case Intrinsic::amdgcn_wave_id:
9993 return lowerWaveID(DAG, Op);
9994 case Intrinsic::amdgcn_lds_kernel_id: {
9995 if (MFI->isEntryFunction())
9996 return getLDSKernelId(DAG, DL);
9997 return getPreloadedValue(DAG, *MFI, VT,
9998 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9999 }
10000 case Intrinsic::amdgcn_workitem_id_x:
10001 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10002 case Intrinsic::amdgcn_workitem_id_y:
10003 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10004 case Intrinsic::amdgcn_workitem_id_z:
10005 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10006 case Intrinsic::amdgcn_wavefrontsize:
10007 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10008 SDLoc(Op), MVT::i32);
10009 case Intrinsic::amdgcn_s_buffer_load: {
10010 unsigned CPol = Op.getConstantOperandVal(3);
10011 // s_buffer_load, because of how it's optimized, can't be volatile
10012 // so reject ones with the volatile bit set.
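// (Concretely, the mask below admits only the cache-policy bits defined for
// the current generation: CPol::ALL on GFX12+ and CPol::ALL_pregfx12 on
// earlier targets; anything else is left for the caller to reject.)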
10013 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10014 ? AMDGPU::CPol::ALL
10015 : AMDGPU::CPol::ALL_pregfx12))
10016 return Op;
10017 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10018 Op.getOperand(3), DAG);
10019 }
10020 case Intrinsic::amdgcn_fdiv_fast:
10021 return lowerFDIV_FAST(Op, DAG);
10022 case Intrinsic::amdgcn_sin:
10023 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10024
10025 case Intrinsic::amdgcn_cos:
10026 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10027
10028 case Intrinsic::amdgcn_mul_u24:
10029 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10030 Op.getOperand(2));
10031 case Intrinsic::amdgcn_mul_i24:
10032 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10033 Op.getOperand(2));
10034
10035 case Intrinsic::amdgcn_log_clamp: {
10036 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10037 return SDValue();
10038
10039 return emitRemovedIntrinsicError(DAG, DL, VT);
10040 }
10041 case Intrinsic::amdgcn_fract:
10042 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10043
10044 case Intrinsic::amdgcn_class:
10045 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10046 Op.getOperand(2));
10047 case Intrinsic::amdgcn_div_fmas:
10048 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10049 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10050
10051 case Intrinsic::amdgcn_div_fixup:
10052 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10053 Op.getOperand(2), Op.getOperand(3));
10054
10055 case Intrinsic::amdgcn_div_scale: {
10056 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10057
10058 // Translate to the operands expected by the machine instruction. The
10059 // first parameter must be the same as the first instruction.
10060 SDValue Numerator = Op.getOperand(1);
10061 SDValue Denominator = Op.getOperand(2);
10062
10063 // Note this order is opposite of the machine instruction's operations,
10064 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10065 // intrinsic has the numerator as the first operand to match a normal
10066 // division operation.
10067
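// The boolean operand therefore only chooses whether src0 aliases the
// numerator or the denominator; the remaining operands keep the
// denominator/numerator order expected by DIV_SCALE.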
10068 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10069
10070 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10071 Denominator, Numerator);
10072 }
10073 case Intrinsic::amdgcn_icmp: {
10074 // There is a Pat that handles this variant, so return it as-is.
10075 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10076 Op.getConstantOperandVal(2) == 0 &&
10077 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10078 return Op;
10079 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10080 }
10081 case Intrinsic::amdgcn_fcmp: {
10082 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10083 }
10084 case Intrinsic::amdgcn_ballot:
10085 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10086 case Intrinsic::amdgcn_fmed3:
10087 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10088 Op.getOperand(2), Op.getOperand(3));
10089 case Intrinsic::amdgcn_fdot2:
10090 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10091 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10092 case Intrinsic::amdgcn_fmul_legacy:
10093 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10094 Op.getOperand(2));
10095 case Intrinsic::amdgcn_sffbh:
10096 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10097 case Intrinsic::amdgcn_sbfe:
10098 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10099 Op.getOperand(2), Op.getOperand(3));
10100 case Intrinsic::amdgcn_ubfe:
10101 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10102 Op.getOperand(2), Op.getOperand(3));
10103 case Intrinsic::amdgcn_cvt_pkrtz:
10104 case Intrinsic::amdgcn_cvt_pknorm_i16:
10105 case Intrinsic::amdgcn_cvt_pknorm_u16:
10106 case Intrinsic::amdgcn_cvt_pk_i16:
10107 case Intrinsic::amdgcn_cvt_pk_u16: {
10108 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
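// When the packed result type is not yet legal, the conversion is emitted
// with an i32 result and bitcast back to the requested vector type.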
10109 EVT VT = Op.getValueType();
10110 unsigned Opcode;
10111
10112 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10113 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10114 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10115 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10116 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10117 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10118 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10119 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10120 else
10121 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10122
10123 if (isTypeLegal(VT))
10124 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10125
10126 SDValue Node =
10127 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10128 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10129 }
10130 case Intrinsic::amdgcn_fmad_ftz:
10131 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10132 Op.getOperand(2), Op.getOperand(3));
10133
10134 case Intrinsic::amdgcn_if_break:
10135 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10136 Op->getOperand(1), Op->getOperand(2)),
10137 0);
10138
10139 case Intrinsic::amdgcn_groupstaticsize: {
10140 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10141 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10142 return Op;
10143
10144 const Module *M = MF.getFunction().getParent();
10145 const GlobalValue *GV =
10146 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10147 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10148 SIInstrInfo::MO_ABS32_LO);
10149 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10150 }
10151 case Intrinsic::amdgcn_is_shared:
10152 case Intrinsic::amdgcn_is_private: {
10153 SDLoc SL(Op);
10154 SDValue SrcVec =
10155 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10156 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10157 DAG.getConstant(1, SL, MVT::i32));
10158
10159 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10160 ? AMDGPUAS::LOCAL_ADDRESS
10161 : AMDGPUAS::PRIVATE_ADDRESS;
10162 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10163 Subtarget->hasGloballyAddressableScratch()) {
10164 SDValue FlatScratchBaseHi(
10165 DAG.getMachineNode(
10166 AMDGPU::S_MOV_B32, DL, MVT::i32,
10167 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10168 0);
10169 // Test bits 63..58 against the aperture address.
10170 return DAG.getSetCC(
10171 SL, MVT::i1,
10172 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10173 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10174 }
10175
10176 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10177 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10178 }
10179 case Intrinsic::amdgcn_perm:
10180 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10181 Op.getOperand(2), Op.getOperand(3));
10182 case Intrinsic::amdgcn_reloc_constant: {
10183 Module *M = MF.getFunction().getParent();
10184 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10185 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10186 auto *RelocSymbol = cast<GlobalVariable>(
10187 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10188 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10189 SIInstrInfo::MO_ABS32_LO);
10190 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10191 }
10192 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10193 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10194 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10195 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10196 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10197 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10198 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10199 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10200 if (Op.getOperand(4).getValueType() == MVT::i32)
10201 return SDValue();
10202
10203 SDLoc SL(Op);
10204 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10205 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10206 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10207 Op.getOperand(3), IndexKeyi32);
10208 }
10209 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10210 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10211 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10212 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10213 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10214 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10215 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10216 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10217 if (Op.getOperand(4).getValueType() == MVT::i64)
10218 return SDValue();
10219
10220 SDLoc SL(Op);
10221 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10222 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10223 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10224 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10225 Op.getOperand(6)});
10226 }
10227 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10228 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10229 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10230 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10231 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10232 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10233 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10234 ? MVT::i64
10235 : MVT::i32;
10236 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10237 return SDValue();
10238
10239 SDLoc SL(Op);
10240 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10241 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10242 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10243 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10244 IndexKey, Op.getOperand(7),
10245 Op.getOperand(8)}); // No clamp operand
10246 }
10247 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10248 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10249 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10250 if (Op.getOperand(6).getValueType() == MVT::i32)
10251 return SDValue();
10252
10253 SDLoc SL(Op);
10254 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10255 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10256 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10257 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10258 IndexKeyi32, Op.getOperand(7)});
10259 }
10260 case Intrinsic::amdgcn_addrspacecast_nonnull:
10261 return lowerADDRSPACECAST(Op, DAG);
10262 case Intrinsic::amdgcn_readlane:
10263 case Intrinsic::amdgcn_readfirstlane:
10264 case Intrinsic::amdgcn_writelane:
10265 case Intrinsic::amdgcn_permlane16:
10266 case Intrinsic::amdgcn_permlanex16:
10267 case Intrinsic::amdgcn_permlane64:
10268 case Intrinsic::amdgcn_set_inactive:
10269 case Intrinsic::amdgcn_set_inactive_chain_arg:
10270 case Intrinsic::amdgcn_mov_dpp8:
10271 case Intrinsic::amdgcn_update_dpp:
10272 return lowerLaneOp(*this, Op.getNode(), DAG);
10273 case Intrinsic::amdgcn_dead: {
10274 SmallVector<SDValue, 8> Poisons;
10275 for (const EVT ValTy : Op.getNode()->values())
10276 Poisons.push_back(DAG.getPOISON(ValTy));
10277 return DAG.getMergeValues(Poisons, SDLoc(Op));
10278 }
10279 default:
10280 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10281 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10282 return lowerImage(Op, ImageDimIntr, DAG, false);
10283
10284 return Op;
10285 }
10286}
10287
10288// On targets not supporting constant in soffset field, turn zero to
10289// SGPR_NULL to avoid generating an extra s_mov with zero.
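// (The null SGPR reads as zero, so the substitution preserves the addressing
// result while avoiding the extra move.)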
10290 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10291 const GCNSubtarget *Subtarget) {
10292 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10293 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10294 return SOffset;
10295}
10296
10297SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10298 SelectionDAG &DAG,
10299 unsigned NewOpcode) const {
10300 SDLoc DL(Op);
10301
10302 SDValue VData = Op.getOperand(2);
10303 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10304 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10305 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10306 SDValue Ops[] = {
10307 Op.getOperand(0), // Chain
10308 VData, // vdata
10309 Rsrc, // rsrc
10310 DAG.getConstant(0, DL, MVT::i32), // vindex
10311 VOffset, // voffset
10312 SOffset, // soffset
10313 Offset, // offset
10314 Op.getOperand(6), // cachepolicy
10315 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10316 };
10317
10318 auto *M = cast<MemSDNode>(Op);
10319
10320 EVT MemVT = VData.getValueType();
10321 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10322 M->getMemOperand());
10323}
10324
10325SDValue
10326SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10327 unsigned NewOpcode) const {
10328 SDLoc DL(Op);
10329
10330 SDValue VData = Op.getOperand(2);
10331 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10332 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10333 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10334 SDValue Ops[] = {
10335 Op.getOperand(0), // Chain
10336 VData, // vdata
10337 Rsrc, // rsrc
10338 Op.getOperand(4), // vindex
10339 VOffset, // voffset
10340 SOffset, // soffset
10341 Offset, // offset
10342 Op.getOperand(7), // cachepolicy
10343 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10344 };
10345
10346 auto *M = cast<MemSDNode>(Op);
10347
10348 EVT MemVT = VData.getValueType();
10349 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10350 M->getMemOperand());
10351}
10352
10353SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10354 SelectionDAG &DAG) const {
10355 unsigned IntrID = Op.getConstantOperandVal(1);
10356 SDLoc DL(Op);
10357
10358 switch (IntrID) {
10359 case Intrinsic::amdgcn_ds_ordered_add:
10360 case Intrinsic::amdgcn_ds_ordered_swap: {
10361 MemSDNode *M = cast<MemSDNode>(Op);
10362 SDValue Chain = M->getOperand(0);
10363 SDValue M0 = M->getOperand(2);
10364 SDValue Value = M->getOperand(3);
10365 unsigned IndexOperand = M->getConstantOperandVal(7);
10366 unsigned WaveRelease = M->getConstantOperandVal(8);
10367 unsigned WaveDone = M->getConstantOperandVal(9);
10368
10369 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10370 IndexOperand &= ~0x3f;
10371 unsigned CountDw = 0;
10372
10373 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10374 CountDw = (IndexOperand >> 24) & 0xf;
10375 IndexOperand &= ~(0xf << 24);
10376
10377 if (CountDw < 1 || CountDw > 4) {
10378 const Function &Fn = DAG.getMachineFunction().getFunction();
10379 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10380 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10381 DL.getDebugLoc()));
10382 CountDw = 1;
10383 }
10384 }
10385
10386 if (IndexOperand) {
10387 const Function &Fn = DAG.getMachineFunction().getFunction();
10388 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10389 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10390 }
10391
10392 if (WaveDone && !WaveRelease) {
10393 // TODO: Move this to IR verifier
10394 const Function &Fn = DAG.getMachineFunction().getFunction();
10395 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10396 Fn, "ds_ordered_count: wave_done requires wave_release",
10397 DL.getDebugLoc()));
10398 }
10399
10400 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10401 unsigned ShaderType =
10402 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10403 unsigned Offset0 = OrderedCountIndex << 2;
10404 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10405
10406 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10407 Offset1 |= (CountDw - 1) << 6;
10408
10409 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10410 Offset1 |= ShaderType << 2;
10411
10412 unsigned Offset = Offset0 | (Offset1 << 8);
10413
10414 SDValue Ops[] = {
10415 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10416 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10417 };
10418 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10419 M->getVTList(), Ops, M->getMemoryVT(),
10420 M->getMemOperand());
10421 }
10422 case Intrinsic::amdgcn_raw_buffer_load:
10423 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10424 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10425 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10426 case Intrinsic::amdgcn_raw_buffer_load_format:
10427 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10428 const bool IsFormat =
10429 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10430 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10431
10432 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10433 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10434 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10435 SDValue Ops[] = {
10436 Op.getOperand(0), // Chain
10437 Rsrc, // rsrc
10438 DAG.getConstant(0, DL, MVT::i32), // vindex
10439 VOffset, // voffset
10440 SOffset, // soffset
10441 Offset, // offset
10442 Op.getOperand(5), // cachepolicy, swizzled buffer
10443 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10444 };
10445
10446 auto *M = cast<MemSDNode>(Op);
10447 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10448 }
10449 case Intrinsic::amdgcn_struct_buffer_load:
10450 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10451 case Intrinsic::amdgcn_struct_buffer_load_format:
10452 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10453 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10454 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10455 const bool IsFormat =
10456 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10457 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10458
10459 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10460 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10461 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10462 SDValue Ops[] = {
10463 Op.getOperand(0), // Chain
10464 Rsrc, // rsrc
10465 Op.getOperand(3), // vindex
10466 VOffset, // voffset
10467 SOffset, // soffset
10468 Offset, // offset
10469 Op.getOperand(6), // cachepolicy, swizzled buffer
10470 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10471 };
10472
10473 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10474 }
10475 case Intrinsic::amdgcn_raw_tbuffer_load:
10476 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10477 MemSDNode *M = cast<MemSDNode>(Op);
10478 EVT LoadVT = Op.getValueType();
10479 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10480 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10481 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10482
10483 SDValue Ops[] = {
10484 Op.getOperand(0), // Chain
10485 Rsrc, // rsrc
10486 DAG.getConstant(0, DL, MVT::i32), // vindex
10487 VOffset, // voffset
10488 SOffset, // soffset
10489 Offset, // offset
10490 Op.getOperand(5), // format
10491 Op.getOperand(6), // cachepolicy, swizzled buffer
10492 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10493 };
10494
10495 if (LoadVT.getScalarType() == MVT::f16)
10496 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10497 Ops);
10498 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10499 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10500 DAG);
10501 }
10502 case Intrinsic::amdgcn_struct_tbuffer_load:
10503 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10504 MemSDNode *M = cast<MemSDNode>(Op);
10505 EVT LoadVT = Op.getValueType();
10506 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10507 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10508 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10509
10510 SDValue Ops[] = {
10511 Op.getOperand(0), // Chain
10512 Rsrc, // rsrc
10513 Op.getOperand(3), // vindex
10514 VOffset, // voffset
10515 SOffset, // soffset
10516 Offset, // offset
10517 Op.getOperand(6), // format
10518 Op.getOperand(7), // cachepolicy, swizzled buffer
10519 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10520 };
10521
10522 if (LoadVT.getScalarType() == MVT::f16)
10523 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10524 Ops);
10525 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10526 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10527 DAG);
10528 }
10529 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10530 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10531 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10532 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10533 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10534 return lowerStructBufferAtomicIntrin(Op, DAG,
10535 AMDGPUISD::BUFFER_ATOMIC_FADD);
10536 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10537 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10538 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10539 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10540 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10541 return lowerStructBufferAtomicIntrin(Op, DAG,
10542 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10543 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10544 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10545 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10546 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10547 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10548 return lowerStructBufferAtomicIntrin(Op, DAG,
10549 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10550 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10551 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10552 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10553 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10554 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10555 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10556 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10557 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10558 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10559 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10560 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10561 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10562 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10563 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10564 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10565 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10567 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10568 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10569 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10570 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10571 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10572 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10573 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10574 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10575 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10576 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10577 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10579 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10580 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10581 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10582 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10583 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10584 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10585 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10586 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10587 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10588 return lowerStructBufferAtomicIntrin(Op, DAG,
10589 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10590 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10591 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10592 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10593 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10594 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10595 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10596 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10597 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10598 return lowerStructBufferAtomicIntrin(Op, DAG,
10599 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10600 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10601 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10602 return lowerStructBufferAtomicIntrin(Op, DAG,
10603 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10604 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10605 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10606 return lowerStructBufferAtomicIntrin(Op, DAG,
10607 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10608 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10610 return lowerStructBufferAtomicIntrin(Op, DAG,
10611 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10612 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10613 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10614 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10615 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10617 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10618 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10620 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10621 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10622 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10623 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10624 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10625 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10626 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10627 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10629 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10630 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10631 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10632 return lowerStructBufferAtomicIntrin(Op, DAG,
10633 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10634 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10635 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10636 return lowerRawBufferAtomicIntrin(Op, DAG,
10637 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10638 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10640 return lowerStructBufferAtomicIntrin(Op, DAG,
10641 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10642 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10644 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10645 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10646 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10647 SDValue Ops[] = {
10648 Op.getOperand(0), // Chain
10649 Op.getOperand(2), // src
10650 Op.getOperand(3), // cmp
10651 Rsrc, // rsrc
10652 DAG.getConstant(0, DL, MVT::i32), // vindex
10653 VOffset, // voffset
10654 SOffset, // soffset
10655 Offset, // offset
10656 Op.getOperand(7), // cachepolicy
10657 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10658 };
10659 EVT VT = Op.getValueType();
10660 auto *M = cast<MemSDNode>(Op);
10661
10662 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10663 Op->getVTList(), Ops, VT,
10664 M->getMemOperand());
10665 }
10666 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10667 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10668 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10669 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10670 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10671 SDValue Ops[] = {
10672 Op.getOperand(0), // Chain
10673 Op.getOperand(2), // src
10674 Op.getOperand(3), // cmp
10675 Rsrc, // rsrc
10676 Op.getOperand(5), // vindex
10677 VOffset, // voffset
10678 SOffset, // soffset
10679 Offset, // offset
10680 Op.getOperand(8), // cachepolicy
10681 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10682 };
10683 EVT VT = Op.getValueType();
10684 auto *M = cast<MemSDNode>(Op);
10685
10686 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10687 Op->getVTList(), Ops, VT,
10688 M->getMemOperand());
10689 }
10690 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10691 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10692 MemSDNode *M = cast<MemSDNode>(Op);
10693 SDValue NodePtr = M->getOperand(2);
10694 SDValue RayExtent = M->getOperand(3);
10695 SDValue InstanceMask = M->getOperand(4);
10696 SDValue RayOrigin = M->getOperand(5);
10697 SDValue RayDir = M->getOperand(6);
10698 SDValue Offsets = M->getOperand(7);
10699 SDValue TDescr = M->getOperand(8);
10700
10701 assert(NodePtr.getValueType() == MVT::i64);
10702 assert(RayDir.getValueType() == MVT::v3f32);
10703
10704 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10705 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10706 return SDValue();
10707 }
10708
10709 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10710 const unsigned NumVDataDwords = 10;
10711 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10712 int Opcode = AMDGPU::getMIMGOpcode(
10713 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10714 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10715 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10716 assert(Opcode != -1);
10718 SmallVector<SDValue, 16> Ops;
10719 Ops.push_back(NodePtr);
10720 Ops.push_back(DAG.getBuildVector(
10721 MVT::v2i32, DL,
10722 {DAG.getBitcast(MVT::i32, RayExtent),
10723 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10724 Ops.push_back(RayOrigin);
10725 Ops.push_back(RayDir);
10726 Ops.push_back(Offsets);
10727 Ops.push_back(TDescr);
10728 Ops.push_back(M->getChain());
10729
10730 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10731 MachineMemOperand *MemRef = M->getMemOperand();
10732 DAG.setNodeMemRefs(NewNode, {MemRef});
10733 return SDValue(NewNode, 0);
10734 }
10735 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10736 MemSDNode *M = cast<MemSDNode>(Op);
10737 SDValue NodePtr = M->getOperand(2);
10738 SDValue RayExtent = M->getOperand(3);
10739 SDValue RayOrigin = M->getOperand(4);
10740 SDValue RayDir = M->getOperand(5);
10741 SDValue RayInvDir = M->getOperand(6);
10742 SDValue TDescr = M->getOperand(7);
10743
10744 assert(NodePtr.getValueType() == MVT::i32 ||
10745 NodePtr.getValueType() == MVT::i64);
10746 assert(RayDir.getValueType() == MVT::v3f16 ||
10747 RayDir.getValueType() == MVT::v3f32);
10748
10749 if (!Subtarget->hasGFX10_AEncoding()) {
10750 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10751 return SDValue();
10752 }
10753
10754 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10755 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10756 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10757 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10758 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10759 const unsigned NumVDataDwords = 4;
10760 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10761 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10762 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10763 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10764 IsGFX12Plus;
10765 const unsigned BaseOpcodes[2][2] = {
10766 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10767 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10768 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10769 int Opcode;
10770 if (UseNSA) {
10771 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10772 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10773 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10774 : AMDGPU::MIMGEncGfx10NSA,
10775 NumVDataDwords, NumVAddrDwords);
10776 } else {
10777 assert(!IsGFX12Plus);
10778 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10779 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10780 : AMDGPU::MIMGEncGfx10Default,
10781 NumVDataDwords, NumVAddrDwords);
10782 }
10783 assert(Opcode != -1);
10784
10785 SmallVector<SDValue, 16> Ops;
10786
10787 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10788 SmallVector<SDValue, 3> Lanes;
10789 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10790 if (Lanes[0].getValueSizeInBits() == 32) {
10791 for (unsigned I = 0; I < 3; ++I)
10792 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10793 } else {
10794 if (IsAligned) {
10795 Ops.push_back(DAG.getBitcast(
10796 MVT::i32,
10797 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10798 Ops.push_back(Lanes[2]);
10799 } else {
10800 SDValue Elt0 = Ops.pop_back_val();
10801 Ops.push_back(DAG.getBitcast(
10802 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10803 Ops.push_back(DAG.getBitcast(
10804 MVT::i32,
10805 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10806 }
10807 }
10808 };
10809
10810 if (UseNSA && IsGFX11Plus) {
10811 Ops.push_back(NodePtr);
10812 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10813 Ops.push_back(RayOrigin);
10814 if (IsA16) {
10815 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10816 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10817 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10818 for (unsigned I = 0; I < 3; ++I) {
10819 MergedLanes.push_back(DAG.getBitcast(
10820 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10821 {DirLanes[I], InvDirLanes[I]})));
10822 }
10823 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10824 } else {
10825 Ops.push_back(RayDir);
10826 Ops.push_back(RayInvDir);
10827 }
10828 } else {
10829 if (Is64)
10830 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10831 2);
10832 else
10833 Ops.push_back(NodePtr);
10834
10835 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10836 packLanes(RayOrigin, true);
10837 packLanes(RayDir, true);
10838 packLanes(RayInvDir, false);
10839 }
10840
10841 if (!UseNSA) {
10842 // Build a single vector containing all the operands so far prepared.
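// Non-NSA encodings take the address as one contiguous VGPR tuple, so the
// dwords gathered above are merged into a single build_vector (padded with
// poison up to 16 dwords when more than 12 address dwords are needed).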
10843 if (NumVAddrDwords > 12) {
10844 SDValue Undef = DAG.getPOISON(MVT::i32);
10845 Ops.append(16 - Ops.size(), Undef);
10846 }
10847 assert(Ops.size() >= 8 && Ops.size() <= 12);
10848 SDValue MergedOps =
10849 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10850 Ops.clear();
10851 Ops.push_back(MergedOps);
10852 }
10853
10854 Ops.push_back(TDescr);
10855 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10856 Ops.push_back(M->getChain());
10857
10858 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10859 MachineMemOperand *MemRef = M->getMemOperand();
10860 DAG.setNodeMemRefs(NewNode, {MemRef});
10861 return SDValue(NewNode, 0);
10862 }
10863 case Intrinsic::amdgcn_global_atomic_fmin_num:
10864 case Intrinsic::amdgcn_global_atomic_fmax_num:
10865 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10866 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10867 MemSDNode *M = cast<MemSDNode>(Op);
10868 SDValue Ops[] = {
10869 M->getOperand(0), // Chain
10870 M->getOperand(2), // Ptr
10871 M->getOperand(3) // Value
10872 };
10873 unsigned Opcode = 0;
10874 switch (IntrID) {
10875 case Intrinsic::amdgcn_global_atomic_fmin_num:
10876 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10877 Opcode = ISD::ATOMIC_LOAD_FMIN;
10878 break;
10879 }
10880 case Intrinsic::amdgcn_global_atomic_fmax_num:
10881 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10882 Opcode = ISD::ATOMIC_LOAD_FMAX;
10883 break;
10884 }
10885 default:
10886 llvm_unreachable("unhandled atomic opcode");
10887 }
10888 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10889 Ops, M->getMemOperand());
10890 }
10891 case Intrinsic::amdgcn_s_get_barrier_state:
10892 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10893 SDValue Chain = Op->getOperand(0);
10894 SmallVector<SDValue, 2> Ops;
10895
10896
10897 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10898 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10899 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10900 BarID = (BarID >> 4) & 0x3F;
10901 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10902 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10903 Ops.push_back(K);
10904 Ops.push_back(Chain);
10905 } else {
10906 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10907 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10908 SDValue M0Val;
10909 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10910 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10911 M0Val = SDValue(
10912 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10913 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10914 0);
10915 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10916 } else
10917 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10918 }
10919
10920 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10921 return SDValue(NewMI, 0);
10922 }
10923 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10924 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10925 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10926 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10927 SDValue Chain = Op->getOperand(0);
10928 SDValue Ptr = Op->getOperand(2);
10929 EVT VT = Op->getValueType(0);
10930 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10931 Chain, Ptr, MII->getMemOperand());
10932 }
10933 default:
10934
10935 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10936 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10937 return lowerImage(Op, ImageDimIntr, DAG, true);
10938
10939 return SDValue();
10940 }
10941}
10942
10943// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10944// dwordx4 if on SI and handle TFE loads.
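// With TFE the node carries an extra status dword, so the load below is
// widened by one dword and the value/status pair is split back out of the
// wider result.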
10945SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10946 SDVTList VTList,
10947 ArrayRef<SDValue> Ops, EVT MemVT,
10948 MachineMemOperand *MMO,
10949 SelectionDAG &DAG) const {
10950 LLVMContext &C = *DAG.getContext();
10951 MachineFunction &MF = DAG.getMachineFunction();
10952 EVT VT = VTList.VTs[0];
10953
10954 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10955 bool IsTFE = VTList.NumVTs == 3;
10956 if (IsTFE) {
10957 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10958 unsigned NumOpDWords = NumValueDWords + 1;
10959 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10960 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10961 MachineMemOperand *OpDWordsMMO =
10962 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10963 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10964 OpDWordsVT, OpDWordsMMO, DAG);
10965 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10966 DAG.getVectorIdxConstant(NumValueDWords, DL));
10967 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10968 SDValue ValueDWords =
10969 NumValueDWords == 1
10970 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10971 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10972 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10973 ZeroIdx);
10974 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10975 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10976 }
10977
10978 if (!Subtarget->hasDwordx3LoadStores() &&
10979 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10980 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10981 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10982 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10983 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10984 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10985 WidenedMemVT, WidenedMMO);
10986 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10987 DAG.getVectorIdxConstant(0, DL));
10988 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10989 }
10990
10991 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10992}
10993
10994SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10995 bool ImageStore) const {
10996 EVT StoreVT = VData.getValueType();
10997
10998 // No change for f16 and legal vector D16 types.
10999 if (!StoreVT.isVector())
11000 return VData;
11001
11002 SDLoc DL(VData);
11003 unsigned NumElements = StoreVT.getVectorNumElements();
11004
11005 if (Subtarget->hasUnpackedD16VMem()) {
11006 // We need to unpack the packed data to store.
11007 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11008 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11009
11010 EVT EquivStoreVT =
11011 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11012 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11013 return DAG.UnrollVectorOp(ZExt.getNode());
11014 }
11015
11016 // The sq block of gfx8.1 does not estimate register use correctly for d16
11017 // image store instructions. The data operand is computed as if it were not a
11018 // d16 image instruction.
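// Work around this by repacking the data as i32s: pairs of i16 lanes are
// packed into v2i16 and bitcast to i32, with poison padding so the operand
// has the register count the hardware expects.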
11019 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11020 // Bitcast to i16
11021 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11022 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11023
11024 // Decompose into scalars
11025 SmallVector<SDValue, 4> Elts;
11026 DAG.ExtractVectorElements(IntVData, Elts);
11027
11028 // Group pairs of i16 into v2i16 and bitcast to i32
11029 SmallVector<SDValue, 4> PackedElts;
11030 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11031 SDValue Pair =
11032 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11033 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11034 PackedElts.push_back(IntPair);
11035 }
11036 if ((NumElements % 2) == 1) {
11037 // Handle v3i16
11038 unsigned I = Elts.size() / 2;
11039 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11040 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11041 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11042 PackedElts.push_back(IntPair);
11043 }
11044
11045 // Pad using UNDEF
11046 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11047
11048 // Build final vector
11049 EVT VecVT =
11050 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11051 return DAG.getBuildVector(VecVT, DL, PackedElts);
11052 }
11053
11054 if (NumElements == 3) {
11055 EVT IntStoreVT =
11056 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11057 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11058
11059 EVT WidenedStoreVT = EVT::getVectorVT(
11060 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11061 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11062 WidenedStoreVT.getStoreSizeInBits());
11063 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11064 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11065 }
11066
11067 assert(isTypeLegal(StoreVT));
11068 return VData;
11069}
11070
11071SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11072 SelectionDAG &DAG) const {
11073 SDLoc DL(Op);
11074 SDValue Chain = Op.getOperand(0);
11075 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11076 MachineFunction &MF = DAG.getMachineFunction();
11077
11078 switch (IntrinsicID) {
11079 case Intrinsic::amdgcn_exp_compr: {
11080 if (!Subtarget->hasCompressedExport()) {
11081 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11083 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11084 }
11085 SDValue Src0 = Op.getOperand(4);
11086 SDValue Src1 = Op.getOperand(5);
11087 // Hack around illegal type on SI by directly selecting it.
11088 if (isTypeLegal(Src0.getValueType()))
11089 return SDValue();
11090
11091 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11092 SDValue Undef = DAG.getPOISON(MVT::f32);
11093 const SDValue Ops[] = {
11094 Op.getOperand(2), // tgt
11095 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11096 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11097 Undef, // src2
11098 Undef, // src3
11099 Op.getOperand(7), // vm
11100 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11101 Op.getOperand(3), // en
11102 Op.getOperand(0) // Chain
11103 };
11104
11105 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11106 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11107 }
11108
11109 case Intrinsic::amdgcn_struct_tbuffer_store:
11110 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11111 SDValue VData = Op.getOperand(2);
11112 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11113 if (IsD16)
11114 VData = handleD16VData(VData, DAG);
11115 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11116 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11117 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11118 SDValue Ops[] = {
11119 Chain,
11120 VData, // vdata
11121 Rsrc, // rsrc
11122 Op.getOperand(4), // vindex
11123 VOffset, // voffset
11124 SOffset, // soffset
11125 Offset, // offset
11126 Op.getOperand(7), // format
11127 Op.getOperand(8), // cachepolicy, swizzled buffer
11128 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11129 };
11130 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11131 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11132 MemSDNode *M = cast<MemSDNode>(Op);
11133 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11134 M->getMemoryVT(), M->getMemOperand());
11135 }
11136
11137 case Intrinsic::amdgcn_raw_tbuffer_store:
11138 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11139 SDValue VData = Op.getOperand(2);
11140 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11141 if (IsD16)
11142 VData = handleD16VData(VData, DAG);
11143 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11144 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11145 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11146 SDValue Ops[] = {
11147 Chain,
11148 VData, // vdata
11149 Rsrc, // rsrc
11150 DAG.getConstant(0, DL, MVT::i32), // vindex
11151 VOffset, // voffset
11152 SOffset, // soffset
11153 Offset, // offset
11154 Op.getOperand(6), // format
11155 Op.getOperand(7), // cachepolicy, swizzled buffer
11156 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11157 };
11158 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11159 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11160 MemSDNode *M = cast<MemSDNode>(Op);
11161 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11162 M->getMemoryVT(), M->getMemOperand());
11163 }
11164
11165 case Intrinsic::amdgcn_raw_buffer_store:
11166 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11167 case Intrinsic::amdgcn_raw_buffer_store_format:
11168 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11169 const bool IsFormat =
11170 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11171 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11172
11173 SDValue VData = Op.getOperand(2);
11174 EVT VDataVT = VData.getValueType();
11175 EVT EltType = VDataVT.getScalarType();
11176 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11177 if (IsD16) {
11178 VData = handleD16VData(VData, DAG);
11179 VDataVT = VData.getValueType();
11180 }
11181
11182 if (!isTypeLegal(VDataVT)) {
11183 VData =
11184 DAG.getNode(ISD::BITCAST, DL,
11185 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11186 }
11187
11188 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11189 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11190 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11191 SDValue Ops[] = {
11192 Chain,
11193 VData,
11194 Rsrc,
11195 DAG.getConstant(0, DL, MVT::i32), // vindex
11196 VOffset, // voffset
11197 SOffset, // soffset
11198 Offset, // offset
11199 Op.getOperand(6), // cachepolicy, swizzled buffer
11200 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11201 };
11202 unsigned Opc =
11203 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11204 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11205 MemSDNode *M = cast<MemSDNode>(Op);
11206
11207 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11208 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11209 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11210
11211 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11212 M->getMemoryVT(), M->getMemOperand());
11213 }
11214
11215 case Intrinsic::amdgcn_struct_buffer_store:
11216 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11217 case Intrinsic::amdgcn_struct_buffer_store_format:
11218 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11219 const bool IsFormat =
11220 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11221 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11222
11223 SDValue VData = Op.getOperand(2);
11224 EVT VDataVT = VData.getValueType();
11225 EVT EltType = VDataVT.getScalarType();
11226 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11227
11228 if (IsD16) {
11229 VData = handleD16VData(VData, DAG);
11230 VDataVT = VData.getValueType();
11231 }
11232
11233 if (!isTypeLegal(VDataVT)) {
11234 VData =
11235 DAG.getNode(ISD::BITCAST, DL,
11236 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11237 }
11238
11239 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11240 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11241 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11242 SDValue Ops[] = {
11243 Chain,
11244 VData,
11245 Rsrc,
11246 Op.getOperand(4), // vindex
11247 VOffset, // voffset
11248 SOffset, // soffset
11249 Offset, // offset
11250 Op.getOperand(7), // cachepolicy, swizzled buffer
11251 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11252 };
11253 unsigned Opc =
11254 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11255 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11256 MemSDNode *M = cast<MemSDNode>(Op);
11257
11258 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11259 EVT VDataType = VData.getValueType().getScalarType();
11260 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11261 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11262
11263 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11264 M->getMemoryVT(), M->getMemOperand());
11265 }
11266 case Intrinsic::amdgcn_raw_buffer_load_lds:
11267 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11268 case Intrinsic::amdgcn_struct_buffer_load_lds:
11269 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11270 if (!Subtarget->hasVMemToLDSLoad())
11271 return SDValue();
11272 unsigned Opc;
11273 bool HasVIndex =
11274 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11275 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11276 unsigned OpOffset = HasVIndex ? 1 : 0;
11277 SDValue VOffset = Op.getOperand(5 + OpOffset);
11278 bool HasVOffset = !isNullConstant(VOffset);
11279 unsigned Size = Op->getConstantOperandVal(4);
11280
11281 switch (Size) {
11282 default:
11283 return SDValue();
11284 case 1:
11285 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11286 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11287 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11288 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11289 break;
11290 case 2:
11291 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11292 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11293 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11294 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11295 break;
11296 case 4:
11297 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11298 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11299 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11300 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11301 break;
11302 case 12:
11303 if (!Subtarget->hasLDSLoadB96_B128())
11304 return SDValue();
11305 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11306 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11307 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11308 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11309 break;
11310 case 16:
11311 if (!Subtarget->hasLDSLoadB96_B128())
11312 return SDValue();
11313 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11314 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11315 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11316 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11317 break;
11318 }
11319
11320 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11321
11323 SmallVector<SDValue, 8> Ops;
11324 if (HasVIndex && HasVOffset)
11325 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11326 {Op.getOperand(5), // VIndex
11327 VOffset}));
11328 else if (HasVIndex)
11329 Ops.push_back(Op.getOperand(5));
11330 else if (HasVOffset)
11331 Ops.push_back(VOffset);
11332
11333 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11334 Ops.push_back(Rsrc);
11335 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11336 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11337 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11338 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11339 Ops.push_back(DAG.getTargetConstant(
11340 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11341 DL, MVT::i8)); // cpol
11342 Ops.push_back(DAG.getTargetConstant(
11343 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11344 ? 1
11345 : 0,
11346 DL, MVT::i8)); // swz
11347 Ops.push_back(M0Val.getValue(0)); // Chain
11348 Ops.push_back(M0Val.getValue(1)); // Glue
11349
11350 auto *M = cast<MemSDNode>(Op);
11351 MachineMemOperand *LoadMMO = M->getMemOperand();
11352 // Don't set the offset value here because the pointer points to the base of
11353 // the buffer.
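// The intrinsic's single memory operand is split into a load MMO and a store
// MMO below so that both sides of the VMEM-to-LDS transfer are represented.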
11354 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11355
11356 MachinePointerInfo StorePtrI = LoadPtrI;
11357 LoadPtrI.V = PoisonValue::get(
11358 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11359 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11360 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11361
11362 auto F = LoadMMO->getFlags() &
11363 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11364 LoadMMO =
11365 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11366 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11367
11368 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11369 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11370 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11371
11372 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11373 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11374
11375 return SDValue(Load, 0);
11376 }
11377 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11378 // for "trust me" that the remaining cases are global pointers until
11379 // such time as we can put two mem operands on an intrinsic.
11380 case Intrinsic::amdgcn_load_to_lds:
11381 case Intrinsic::amdgcn_global_load_lds: {
11382 if (!Subtarget->hasVMemToLDSLoad())
11383 return SDValue();
11384
11385 unsigned Opc;
11386 unsigned Size = Op->getConstantOperandVal(4);
11387 switch (Size) {
11388 default:
11389 return SDValue();
11390 case 1:
11391 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11392 break;
11393 case 2:
11394 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11395 break;
11396 case 4:
11397 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11398 break;
11399 case 12:
11400 if (!Subtarget->hasLDSLoadB96_B128())
11401 return SDValue();
11402 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11403 break;
11404 case 16:
11405 if (!Subtarget->hasLDSLoadB96_B128())
11406 return SDValue();
11407 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11408 break;
11409 }
11410
11411 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11412
11413 SmallVector<SDValue, 6> Ops;
11414
11415 SDValue Addr = Op.getOperand(2); // Global ptr
11416 SDValue VOffset;
11417 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11418 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11419 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11420 SDValue LHS = Addr.getOperand(0);
11421 SDValue RHS = Addr.getOperand(1);
11422
11423 if (LHS->isDivergent())
11424 std::swap(LHS, RHS);
11425
11426 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11427 RHS.getOperand(0).getValueType() == MVT::i32) {
11428 // add (i64 sgpr), (zero_extend (i32 vgpr))
11429 Addr = LHS;
11430 VOffset = RHS.getOperand(0);
11431 }
11432 }
11433
11434 Ops.push_back(Addr);
11435 if (!Addr->isDivergent()) {
11436 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11437 if (!VOffset)
11438 VOffset =
11439 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11440 DAG.getTargetConstant(0, DL, MVT::i32)),
11441 0);
11442 Ops.push_back(VOffset);
11443 }
11444
11445 Ops.push_back(Op.getOperand(5)); // Offset
11446
11447 unsigned Aux = Op.getConstantOperandVal(6);
11448 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11449 MVT::i32)); // CPol
11450
11451 Ops.push_back(M0Val.getValue(0)); // Chain
11452 Ops.push_back(M0Val.getValue(1)); // Glue
11453
11454 auto *M = cast<MemSDNode>(Op);
11455 MachineMemOperand *LoadMMO = M->getMemOperand();
11456 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11457 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11458 MachinePointerInfo StorePtrI = LoadPtrI;
11459 LoadPtrI.V = PoisonValue::get(
11460 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11461 StorePtrI.V = nullptr;
11462 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11463 auto F = LoadMMO->getFlags() &
11464 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11465 LoadMMO =
11466 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11467 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11468 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11469 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11470 LoadMMO->getAAInfo());
11471
11472 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11473 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11474
11475 return SDValue(Load, 0);
11476 }
11477 case Intrinsic::amdgcn_end_cf:
11478 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11479 Op->getOperand(2), Chain),
11480 0);
11481 case Intrinsic::amdgcn_s_barrier_init:
11482 case Intrinsic::amdgcn_s_barrier_signal_var: {
11483 // these two intrinsics have two operands: barrier pointer and member count
11484 SDValue Chain = Op->getOperand(0);
11485 SmallVector<SDValue, 2> Ops;
11486 SDValue BarOp = Op->getOperand(2);
11487 SDValue CntOp = Op->getOperand(3);
11488 SDValue M0Val;
11489 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11490 ? AMDGPU::S_BARRIER_INIT_M0
11491 : AMDGPU::S_BARRIER_SIGNAL_M0;
11492 // extract the BarrierID from bits 4-9 of BarOp
11493 SDValue BarID;
11494 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11495 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11496 BarID =
11497 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11498 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11499 0);
11500 // Member count should be put into M0[ShAmt:+6]
11501 // Barrier ID should be put into M0[5:0]
11502 M0Val =
11503 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11504 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11505 0);
11506 constexpr unsigned ShAmt = 16;
11507 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11508 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11509
11510 M0Val = SDValue(
11511 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11512
11513 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11514
11515 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11516 return SDValue(NewMI, 0);
11517 }
11518 case Intrinsic::amdgcn_s_wakeup_barrier: {
11519 if (!Subtarget->hasSWakeupBarrier())
11520 return SDValue();
11521 [[fallthrough]];
11522 }
11523 case Intrinsic::amdgcn_s_barrier_join: {
11524 // These intrinsics have one operand: the barrier pointer.
11525 SDValue Chain = Op->getOperand(0);
11526 SmallVector<SDValue, 2> Ops;
11527 SDValue BarOp = Op->getOperand(2);
11528 unsigned Opc;
11529
11530 if (isa<ConstantSDNode>(BarOp)) {
11531 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11532 switch (IntrinsicID) {
11533 default:
11534 return SDValue();
11535 case Intrinsic::amdgcn_s_barrier_join:
11536 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11537 break;
11538 case Intrinsic::amdgcn_s_wakeup_barrier:
11539 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11540 break;
11541 }
11542 // extract the BarrierID from bits 4-9 of the immediate
11543 unsigned BarID = (BarVal >> 4) & 0x3F;
11544 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11545 Ops.push_back(K);
11546 Ops.push_back(Chain);
11547 } else {
11548 switch (IntrinsicID) {
11549 default:
11550 return SDValue();
11551 case Intrinsic::amdgcn_s_barrier_join:
11552 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11553 break;
11554 case Intrinsic::amdgcn_s_wakeup_barrier:
11555 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11556 break;
11557 }
11558 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11559 SDValue M0Val;
11560 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11561 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11562 M0Val =
11563 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11564 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11565 0);
11566 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11567 }
11568
11569 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11570 return SDValue(NewMI, 0);
11571 }
11572 case Intrinsic::amdgcn_s_prefetch_data: {
11573 // For non-global address space preserve the chain and remove the call.
11574 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11575 return Op.getOperand(0);
11576 return Op;
11577 }
11578 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11579 SDValue Ops[] = {
11580 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11581 Op.getOperand(3), // offset
11582 Op.getOperand(4), // length
11583 };
11584
11585 MemSDNode *M = cast<MemSDNode>(Op);
11586 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11587 Op->getVTList(), Ops, M->getMemoryVT(),
11588 M->getMemOperand());
11589 }
11590 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11591 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11592 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11593 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11594 SDValue Chain = Op->getOperand(0);
11595 SDValue Ptr = Op->getOperand(2);
11596 SDValue Val = Op->getOperand(3);
11597 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11598 Ptr, MII->getMemOperand());
11599 }
11600 default: {
11601 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11602 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11603 return lowerImage(Op, ImageDimIntr, DAG, true);
11604
11605 return Op;
11606 }
11607 }
11608}
11609
11610// Return whether the operation has NoUnsignedWrap property.
11611static bool isNoUnsignedWrap(SDValue Addr) {
11612 return (Addr.getOpcode() == ISD::ADD &&
11613 Addr->getFlags().hasNoUnsignedWrap()) ||
11614 Addr->getOpcode() == ISD::OR;
11615}
11616
11618 EVT PtrVT) const {
11619 return PtrVT == MVT::i64;
11620}
11621
11623 EVT PtrVT) const {
11624 return true;
11625}
11626
11627// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11628// offset (the offset that is included in bounds checking and swizzling, to be
11629// split between the instruction's voffset and immoffset fields) and soffset
11630// (the offset that is excluded from bounds checking and swizzling, to go in
11631// the instruction's soffset field). This function takes the first kind of
11632// offset and figures out how to split it between voffset and immoffset.
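//
// For example (assuming a 12-bit immoffset field, i.e. MaxImm == 0xFFF): a
// combined offset of 0x1234 is split into voffset 0x1000 and immoffset 0x234,
// so the large power-of-two part can be CSEd across similar loads/stores.
// If the rounded-down part would be negative as an i32 (e.g. a combined
// offset of 0x80000004), the whole value is kept in voffset and immoffset
// becomes 0.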
11633std::pair<SDValue, SDValue>
11634SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11635 SDLoc DL(Offset);
11636 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11637 SDValue N0 = Offset;
11638 ConstantSDNode *C1 = nullptr;
11639
11640 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11641 N0 = SDValue();
11642 else if (DAG.isBaseWithConstantOffset(N0)) {
11643 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11644 // being added, so we can only safely match a 32-bit addition with no
11645 // unsigned overflow.
11646 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11647 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11648 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11649 N0 = N0.getOperand(0);
11650 }
11651 }
11652
11653 if (C1) {
11654 unsigned ImmOffset = C1->getZExtValue();
11655 // If the immediate value is too big for the immoffset field, put only bits
11656 // that would normally fit in the immoffset field. The remaining value that
11657 // is copied/added for the voffset field is a large power of 2, and it
11658 // stands more chance of being CSEd with the copy/add for another similar
11659 // load/store.
11660 // However, do not do that rounding down if the part that would be left for
11661 // the vgpr is negative, as it appears to be illegal to have a negative offset
11662 // in the vgpr, even if adding the immediate offset makes it positive.
11663 unsigned Overflow = ImmOffset & ~MaxImm;
11664 ImmOffset -= Overflow;
11665 if ((int32_t)Overflow < 0) {
11666 Overflow += ImmOffset;
11667 ImmOffset = 0;
11668 }
11669 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11670 if (Overflow) {
11671 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11672 if (!N0)
11673 N0 = OverflowVal;
11674 else {
11675 SDValue Ops[] = {N0, OverflowVal};
11676 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11677 }
11678 }
11679 }
11680 if (!N0)
11681 N0 = DAG.getConstant(0, DL, MVT::i32);
11682 if (!C1)
11683 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11684 return {N0, SDValue(C1, 0)};
11685}
11686
11687// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11688// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11689// pointed to by Offsets.
11690void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11691 SelectionDAG &DAG, SDValue *Offsets,
11692 Align Alignment) const {
11693 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11694 SDLoc DL(CombinedOffset);
11695 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11696 uint32_t Imm = C->getZExtValue();
11697 uint32_t SOffset, ImmOffset;
11698 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11699 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11700 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11701 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11702 return;
11703 }
11704 }
11705 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11706 SDValue N0 = CombinedOffset.getOperand(0);
11707 SDValue N1 = CombinedOffset.getOperand(1);
11708 uint32_t SOffset, ImmOffset;
11709 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11710 if (Offset >= 0 &&
11711 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11712 Offsets[0] = N0;
11713 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11714 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11715 return;
11716 }
11717 }
11718
11719 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11720 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11721 : DAG.getConstant(0, DL, MVT::i32);
11722
11723 Offsets[0] = CombinedOffset;
11724 Offsets[1] = SOffsetZero;
11725 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11726}
11727
11728SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11729 SelectionDAG &DAG) const {
11730 if (!MaybePointer.getValueType().isScalarInteger())
11731 return MaybePointer;
11732
11733 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11734 return Rsrc;
11735}
11736
11737// Wrap a global or flat pointer into a buffer intrinsic using the flags
11738// specified in the intrinsic.
11739SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11740 SelectionDAG &DAG) const {
11741 SDLoc Loc(Op);
11742
11743 SDValue Pointer = Op->getOperand(1);
11744 SDValue Stride = Op->getOperand(2);
11745 SDValue NumRecords = Op->getOperand(3);
11746 SDValue Flags = Op->getOperand(4);
11747
11748 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11749 SDValue Rsrc;
11750
11751 if (Subtarget->has45BitNumRecordsBufferResource()) {
11752 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11753 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
11754 // num_records.
11755 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11756 SDValue NumRecordsLHS =
11757 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11758 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11759 SDValue LowHalf =
11760 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11761
11762 // Build the upper 64-bit value, which holds the upper 38 bits of num_records,
11763 // 6 reserved zero bits (omitted), the 16-bit stride-and-scale field and the 4 flag bits.
11764 SDValue NumRecordsRHS =
11765 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11766 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11767 SDValue ShiftedStride =
11768 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11769 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11770 SDValue ExtShiftedStrideVec =
11771 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11772 SDValue ExtShiftedStride =
11773 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11774 SDValue ShiftedFlags =
11775 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11776 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11777 SDValue ExtShiftedFlagsVec =
11778 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11779 SDValue ExtShiftedFlags =
11780 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11781 SDValue CombinedFields =
11782 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11783 SDValue HighHalf =
11784 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11785
11786 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11787 } else {
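// Legacy 128-bit descriptor layout assembled below (a sketch, not a full V#
// description): word0 = low 32 bits of the base address, word1 = bits
// [47:32] of the base address with the 16-bit stride in bits [31:16],
// word2 = num_records, word3 = flags.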
11788 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11789 auto [LowHalf, HighHalf] =
11790 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11791 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11792 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11793 SDValue ShiftedStride =
11794 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11795 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11796 SDValue NewHighHalf =
11797 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11798
11799 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11800 NumRecords, Flags);
11801 }
11802
11803 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11804 return RsrcPtr;
11805}
11806
11807// Handle 8 bit and 16 bit buffer loads
11808SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11809 EVT LoadVT, SDLoc DL,
11810 ArrayRef<SDValue> Ops,
11811 MachineMemOperand *MMO,
11812 bool IsTFE) const {
11813 EVT IntVT = LoadVT.changeTypeToInteger();
11814
11815 if (IsTFE) {
11816 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11817 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11818 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11819 MachineFunction &MF = DAG.getMachineFunction();
11820 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11821 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11822 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11823 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11824 DAG.getConstant(1, DL, MVT::i32));
11825 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11826 DAG.getConstant(0, DL, MVT::i32));
11827 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11828 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11829 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11830 }
11831
11832 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11833 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11834 : AMDGPUISD::BUFFER_LOAD_USHORT;
11835
11836 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11837 SDValue BufferLoad =
11838 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11839 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11840 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11841
11842 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11843}
11844
11845// Handle 8 bit and 16 bit buffer stores
11846SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11847 EVT VDataType, SDLoc DL,
11848 SDValue Ops[],
11849 MemSDNode *M) const {
11850 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11851 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11852
11853 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11854 Ops[1] = BufferStoreExt;
11855 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11856 : AMDGPUISD::BUFFER_STORE_SHORT;
11857 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11858 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11859 M->getMemOperand());
11860}
11861
11862 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11863 SDValue Op, const SDLoc &SL, EVT VT) {
11864 if (VT.bitsLT(Op.getValueType()))
11865 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11866
11867 switch (ExtType) {
11868 case ISD::SEXTLOAD:
11869 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11870 case ISD::ZEXTLOAD:
11871 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11872 case ISD::EXTLOAD:
11873 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11874 case ISD::NON_EXTLOAD:
11875 return Op;
11876 }
11877
11878 llvm_unreachable("invalid ext type");
11879}
11880
11881// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11882// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11883SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11884 DAGCombinerInfo &DCI) const {
11885 SelectionDAG &DAG = DCI.DAG;
11886 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11887 return SDValue();
11888
11889 // FIXME: Constant loads should all be marked invariant.
11890 unsigned AS = Ld->getAddressSpace();
11891 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11892 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11893 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11894 return SDValue();
11895
11896 // Don't do this early, since it may interfere with adjacent load merging for
11897 // illegal types. We can avoid losing alignment information for exotic types
11898 // pre-legalize.
11899 EVT MemVT = Ld->getMemoryVT();
11900 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11901 MemVT.getSizeInBits() >= 32)
11902 return SDValue();
11903
11904 SDLoc SL(Ld);
11905
11906 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11907 "unexpected vector extload");
11908
11909 // TODO: Drop only high part of range.
11910 SDValue Ptr = Ld->getBasePtr();
11911 SDValue NewLoad = DAG.getLoad(
11912 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11913 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11914 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11915 nullptr); // Drop ranges
11916
11917 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11918 if (MemVT.isFloatingPoint()) {
11919 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11920 "unexpected fp extload");
11921 TruncVT = MemVT.changeTypeToInteger();
11922 }
11923
11924 SDValue Cvt = NewLoad;
11925 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11926 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11927 DAG.getValueType(TruncVT));
11928 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11929 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11930 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11931 } else {
11932 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11933 }
11934
11935 EVT VT = Ld->getValueType(0);
11936 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11937
11938 DCI.AddToWorklist(Cvt.getNode());
11939
11940 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11941 // the appropriate extension from the 32-bit load.
11942 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11943 DCI.AddToWorklist(Cvt.getNode());
11944
11945 // Handle conversion back to floating point if necessary.
11946 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11947
11948 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11949}
11950
11951 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11952 const SIMachineFunctionInfo &Info) {
11953 // TODO: Should check if the address can definitely not access stack.
11954 if (Info.isEntryFunction())
11955 return Info.getUserSGPRInfo().hasFlatScratchInit();
11956 return true;
11957}
11958
11959SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11960 SDLoc DL(Op);
11961 LoadSDNode *Load = cast<LoadSDNode>(Op);
11962 ISD::LoadExtType ExtType = Load->getExtensionType();
11963 EVT MemVT = Load->getMemoryVT();
11964 MachineMemOperand *MMO = Load->getMemOperand();
11965
11966 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11967 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11968 return SDValue();
11969
11970 // FIXME: Copied from PPC
11971 // First, load into 32 bits, then truncate to 1 bit.
11972
11973 SDValue Chain = Load->getChain();
11974 SDValue BasePtr = Load->getBasePtr();
11975
11976 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11977
11978 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11979 RealMemVT, MMO);
11980
11981 if (!MemVT.isVector()) {
11982 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11983 NewLD.getValue(1)};
11984
11985 return DAG.getMergeValues(Ops, DL);
11986 }
11987
11988 SmallVector<SDValue, 3> Elts;
11989 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11990 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11991 DAG.getConstant(I, DL, MVT::i32));
11992
11993 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11994 }
11995
11996 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11997
11998 return DAG.getMergeValues(Ops, DL);
11999 }
12000
12001 if (!MemVT.isVector())
12002 return SDValue();
12003
12004 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12005 "Custom lowering for non-i32 vectors hasn't been implemented.");
12006
12007 Align Alignment = Load->getAlign();
12008 unsigned AS = Load->getAddressSpace();
12009 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12010 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12011 return SplitVectorLoad(Op, DAG);
12012 }
12013
12014 MachineFunction &MF = DAG.getMachineFunction();
12015 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12016 // If there is a possibility that flat instruction access scratch memory
12017 // then we need to use the same legalization rules we use for private.
12018 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12019 !Subtarget->hasMultiDwordFlatScratchAddressing())
12020 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12021 ? AMDGPUAS::PRIVATE_ADDRESS
12022 : AMDGPUAS::GLOBAL_ADDRESS;
12023
12024 unsigned NumElements = MemVT.getVectorNumElements();
12025
12026 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12027 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12028 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12029 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12030 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12031 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12032 Alignment >= Align(4) && NumElements < 32) {
12033 if (MemVT.isPow2VectorType() ||
12034 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12035 return SDValue();
12036 return WidenOrSplitVectorLoad(Op, DAG);
12037 }
12038 // Non-uniform loads will be selected to MUBUF instructions, so they
12039 // have the same legalization requirements as global and private
12040 // loads.
12041 //
12042 }
12043 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12044 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12045 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12046 if (NumElements > 4)
12047 return SplitVectorLoad(Op, DAG);
12048 // v3 loads not supported on SI.
12049 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12050 return WidenOrSplitVectorLoad(Op, DAG);
12051
12052 // v3 and v4 loads are supported for private and global memory.
12053 return SDValue();
12054 }
12055 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12056 // Depending on the setting of the private_element_size field in the
12057 // resource descriptor, we can only make private accesses up to a certain
12058 // size.
12059 switch (Subtarget->getMaxPrivateElementSize()) {
12060 case 4: {
12061 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12062 return DAG.getMergeValues({Op0, Op1}, DL);
12063 }
12064 case 8:
12065 if (NumElements > 2)
12066 return SplitVectorLoad(Op, DAG);
12067 return SDValue();
12068 case 16:
12069 // Same as global/flat
12070 if (NumElements > 4)
12071 return SplitVectorLoad(Op, DAG);
12072 // v3 loads not supported on SI.
12073 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12074 return WidenOrSplitVectorLoad(Op, DAG);
12075
12076 return SDValue();
12077 default:
12078 llvm_unreachable("unsupported private_element_size");
12079 }
12080 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12081 unsigned Fast = 0;
12082 auto Flags = Load->getMemOperand()->getFlags();
12083 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
12084 Load->getAlign(), Flags, &Fast) &&
12085 Fast > 1)
12086 return SDValue();
12087
12088 if (MemVT.isVector())
12089 return SplitVectorLoad(Op, DAG);
12090 }
12091
12092 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12093 MemVT, *Load->getMemOperand())) {
12094 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12095 return DAG.getMergeValues({Op0, Op1}, DL);
12096 }
12097
12098 return SDValue();
12099}
12100
12101SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12102 EVT VT = Op.getValueType();
12103 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12104 VT.getSizeInBits() == 512)
12105 return splitTernaryVectorOp(Op, DAG);
12106
12107 assert(VT.getSizeInBits() == 64);
12108
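// A 64-bit select is decomposed into two 32-bit selects on the low and high
// halves, roughly:
//   select c, x, y -> build_vector(select c, x.lo, y.lo,
//                                  select c, x.hi, y.hi)
// so that it maps onto 32-bit conditional moves.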
12109 SDLoc DL(Op);
12110 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12111
12112 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12113 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12114
12115 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12116 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12117
12118 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12119 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12120
12121 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12122
12123 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12124 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12125
12126 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12127
12128 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12129 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12130}
12131
12132// Catch division cases where we can use shortcuts with rcp and rsq
12133// instructions.
12134SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12135 SelectionDAG &DAG) const {
12136 SDLoc SL(Op);
12137 SDValue LHS = Op.getOperand(0);
12138 SDValue RHS = Op.getOperand(1);
12139 EVT VT = Op.getValueType();
12140 const SDNodeFlags Flags = Op->getFlags();
12141
12142 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12143
12144 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12145 // Without !fpmath accuracy information, we can't do more because we don't
12146 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12147 // f16 is always accurate enough
12148 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12149 return SDValue();
12150
12151 if (CLHS->isExactlyValue(1.0)) {
12152 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12153 // the CI documentation have a worst-case error of 1 ulp.
12154 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12155 // use it as long as we aren't trying to use denormals.
12156 //
12157 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12158
12159 // 1.0 / sqrt(x) -> rsq(x)
12160
12161 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12162 // error seems really high at 2^29 ULP.
12163 // 1.0 / x -> rcp(x)
12164 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12165 }
12166
12167 // Same as for 1.0, but expand the sign out of the constant.
12168 if (CLHS->isExactlyValue(-1.0)) {
12169 // -1.0 / x -> rcp (fneg x)
12170 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12171 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12172 }
12173 }
12174
12175 // For f16 and bf16 require afn or arcp.
12176 // For f32 require afn.
12177 if (!AllowInaccurateRcp &&
12178 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12179 return SDValue();
12180
12181 // Turn into multiply by the reciprocal.
12182 // x / y -> x * (1.0 / y)
12183 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12184 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12185}
12186
12187SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12188 SelectionDAG &DAG) const {
12189 SDLoc SL(Op);
12190 SDValue X = Op.getOperand(0);
12191 SDValue Y = Op.getOperand(1);
12192 EVT VT = Op.getValueType();
12193 const SDNodeFlags Flags = Op->getFlags();
12194
12195 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12196 if (!AllowInaccurateDiv)
12197 return SDValue();
12198
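// Newton-Raphson refinement of r ~= 1/y, written with FMAs:
//   r' = r + r * (1 - y * r)
// applied twice to the initial rcp estimate, followed by one refinement of
// the quotient q = x * r:
//   q' = q + r * (x - y * q)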
12199 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12200 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12201
12202 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12203 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12204
12205 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12206 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12207 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12208 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12209 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12210 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12211}
12212
12213static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12214 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12215 SDNodeFlags Flags) {
12216 if (GlueChain->getNumValues() <= 1) {
12217 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12218 }
12219
12220 assert(GlueChain->getNumValues() == 3);
12221
12222 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12223 switch (Opcode) {
12224 default:
12225 llvm_unreachable("no chain equivalent for opcode");
12226 case ISD::FMUL:
12227 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12228 break;
12229 }
12230
12231 return DAG.getNode(Opcode, SL, VTList,
12232 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12233 Flags);
12234}
12235
12236static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12237 EVT VT, SDValue A, SDValue B, SDValue C,
12238 SDValue GlueChain, SDNodeFlags Flags) {
12239 if (GlueChain->getNumValues() <= 1) {
12240 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12241 }
12242
12243 assert(GlueChain->getNumValues() == 3);
12244
12245 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12246 switch (Opcode) {
12247 default:
12248 llvm_unreachable("no chain equivalent for opcode");
12249 case ISD::FMA:
12250 Opcode = AMDGPUISD::FMA_W_CHAIN;
12251 break;
12252 }
12253
12254 return DAG.getNode(Opcode, SL, VTList,
12255 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12256 Flags);
12257}
12258
12259SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12260 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12261 return FastLowered;
12262
12263 SDLoc SL(Op);
12264 EVT VT = Op.getValueType();
12265 SDValue LHS = Op.getOperand(0);
12266 SDValue RHS = Op.getOperand(1);
12267
12268 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12269 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12270
12271 if (VT == MVT::bf16) {
12272 SDValue ExtDiv =
12273 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12274 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12275 DAG.getTargetConstant(0, SL, MVT::i32));
12276 }
12277
12278 assert(VT == MVT::f16);
12279
12280 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12281 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12282 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12283 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12284 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12285 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12286 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12287 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12288 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12289 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12290 // q16.u = opx(V_CVT_F16_F32, q32.u);
12291 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12292
12293 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12294 unsigned FMADOpCode =
12295 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12296 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12297 SDValue Rcp =
12298 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12299 SDValue Quot =
12300 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12301 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12302 Op->getFlags());
12303 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12304 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12305 Op->getFlags());
12306 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12307 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
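// 0xff800000 keeps only the sign and exponent bits of the f32 error term,
// i.e. it truncates the mantissa so that only a coarse power-of-two
// correction is added back into the quotient below.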
12308 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12309 DAG.getConstant(0xff800000, SL, MVT::i32));
12310 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12311 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12312 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12313 DAG.getTargetConstant(0, SL, MVT::i32));
12314 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12315 Op->getFlags());
12316}
12317
12318// Faster 2.5 ULP division that does not support denormals.
12319SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12320 SDNodeFlags Flags = Op->getFlags();
12321 SDLoc SL(Op);
12322 SDValue LHS = Op.getOperand(1);
12323 SDValue RHS = Op.getOperand(2);
12324
12325 // TODO: The combiner should probably handle elimination of redundant fabs.
12327 ? RHS
12328 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
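// For very large denominators the reciprocal can land in or near the
// denormal range, which rcp flushes; the denominator is therefore pre-scaled
// by 2^-32 when |RHS| > 2^+96, and the final result is multiplied by the
// same factor (r3) to compensate: x / y == (x * rcp(y * 2^-32)) * 2^-32.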
12329
12330 const APFloat K0Val(0x1p+96f);
12331 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12332
12333 const APFloat K1Val(0x1p-32f);
12334 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12335
12336 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12337
12338 EVT SetCCVT =
12339 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12340
12341 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12342
12343 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12344
12345 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12346
12347 // rcp does not support denormals.
12348 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12349
12350 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12351
12352 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12353}
12354
12355// Returns immediate value for setting the F32 denorm mode when using the
12356// S_DENORM_MODE instruction.
12357 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12358 const SIMachineFunctionInfo *Info,
12359 const GCNSubtarget *ST) {
12360 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12361 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12362 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12363 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12364}
12365
12366SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12367 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12368 return FastLowered;
12369
12370 // The selection matcher assumes anything with a chain selecting to a
12371 // mayRaiseFPException machine instruction. Since we're introducing a chain
12372 // here, we need to explicitly report nofpexcept for the regular fdiv
12373 // lowering.
12374 SDNodeFlags Flags = Op->getFlags();
12375 Flags.setNoFPExcept(true);
12376
12377 SDLoc SL(Op);
12378 SDValue LHS = Op.getOperand(0);
12379 SDValue RHS = Op.getOperand(1);
12380
12381 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12382
12383 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12384
12385 SDValue DenominatorScaled =
12386 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12387 SDValue NumeratorScaled =
12388 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12389
12390 // Denominator is scaled to not be denormal, so using rcp is ok.
12391 SDValue ApproxRcp =
12392 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12393 SDValue NegDivScale0 =
12394 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12395
12396 using namespace AMDGPU::Hwreg;
12397 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12398 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12399
12400 const MachineFunction &MF = DAG.getMachineFunction();
12401 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12402 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12403
12404 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12405 const bool HasDynamicDenormals =
12406 (DenormMode.Input == DenormalMode::Dynamic) ||
12407 (DenormMode.Output == DenormalMode::Dynamic);
12408
12409 SDValue SavedDenormMode;
12410
12411 if (!PreservesDenormals) {
12412 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12413 // lowering. The chain dependence is insufficient, and we need glue. We do
12414 // not need the glue variants in a strictfp function.
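// Denormals are enabled around the FMA sequence because the DIV_SCALE'd
// intermediate results can fall into the denormal range; flushing them
// would break the precision expected by the final DIV_FMAS/DIV_FIXUP.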
12415
12416 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12417
12418 SDValue Glue = DAG.getEntryNode();
12419 if (HasDynamicDenormals) {
12420 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12421 DAG.getVTList(MVT::i32, MVT::Glue),
12422 {BitField, Glue});
12423 SavedDenormMode = SDValue(GetReg, 0);
12424
12425 Glue = DAG.getMergeValues(
12426 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12427 }
12428
12429 SDNode *EnableDenorm;
12430 if (Subtarget->hasDenormModeInst()) {
12431 const SDValue EnableDenormValue =
12432 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12433
12434 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12435 EnableDenormValue)
12436 .getNode();
12437 } else {
12438 const SDValue EnableDenormValue =
12439 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12440 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12441 {EnableDenormValue, BitField, Glue});
12442 }
12443
12444 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12445 SDValue(EnableDenorm, 1)};
12446
12447 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12448 }
12449
12450 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12451 ApproxRcp, One, NegDivScale0, Flags);
12452
12453 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12454 ApproxRcp, Fma0, Flags);
12455
12456 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12457 Fma1, Flags);
12458
12459 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12460 NumeratorScaled, Mul, Flags);
12461
12462 SDValue Fma3 =
12463 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12464
12465 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12466 NumeratorScaled, Fma3, Flags);
12467
12468 if (!PreservesDenormals) {
12469 SDNode *DisableDenorm;
12470 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12471 const SDValue DisableDenormValue = getSPDenormModeValue(
12472 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12473
12474 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12475 DisableDenorm =
12476 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12477 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12478 .getNode();
12479 } else {
12480 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12481 const SDValue DisableDenormValue =
12482 HasDynamicDenormals
12483 ? SavedDenormMode
12484 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12485
12486 DisableDenorm = DAG.getMachineNode(
12487 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12488 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12489 }
12490
12491 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12492 SDValue(DisableDenorm, 0), DAG.getRoot());
12493 DAG.setRoot(OutputChain);
12494 }
12495
12496 SDValue Scale = NumeratorScaled.getValue(1);
12497 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12498 {Fma4, Fma1, Fma3, Scale}, Flags);
12499
12500 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12501}
12502
12503SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12504 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12505 return FastLowered;
12506
12507 SDLoc SL(Op);
12508 SDValue X = Op.getOperand(0);
12509 SDValue Y = Op.getOperand(1);
12510
12511 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12512
12513 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12514
12515 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12516
12517 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12518
12519 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12520
12521 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12522
12523 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12524
12525 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12526
12527 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12528
12529 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12530 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12531
12532 SDValue Fma4 =
12533 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12534
12535 SDValue Scale;
12536
12537 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12538 // Workaround a hardware bug on SI where the condition output from div_scale
12539 // is not usable.
12540
12541 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12542
12543 // Figure out which scale to use for div_fmas.
12544 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12545 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12546 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12547 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12548
12549 SDValue NumHi =
12550 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12551 SDValue DenHi =
12552 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12553
12554 SDValue Scale0Hi =
12555 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12556 SDValue Scale1Hi =
12557 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12558
12559 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12560 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12561 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12562 } else {
12563 Scale = DivScale1.getValue(1);
12564 }
12565
12566 SDValue Fmas =
12567 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12568
12569 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12570}
12571
12572SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12573 EVT VT = Op.getValueType();
12574
12575 if (VT == MVT::f32)
12576 return LowerFDIV32(Op, DAG);
12577
12578 if (VT == MVT::f64)
12579 return LowerFDIV64(Op, DAG);
12580
12581 if (VT == MVT::f16 || VT == MVT::bf16)
12582 return LowerFDIV16(Op, DAG);
12583
12584 llvm_unreachable("Unexpected type for fdiv");
12585}
12586
12587SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12588 SDLoc dl(Op);
12589 SDValue Val = Op.getOperand(0);
12590 EVT VT = Val.getValueType();
12591 EVT ResultExpVT = Op->getValueType(1);
12592 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12593
12594 SDValue Mant = DAG.getNode(
12595 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12596 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12597
12598 SDValue Exp = DAG.getNode(
12599 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12600 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12601
12602 if (Subtarget->hasFractBug()) {
12603 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12604 SDValue Inf =
12605 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12606
12607 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12608 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12609 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12610 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12611 }
12612
12613 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12614 return DAG.getMergeValues({Mant, CastExp}, dl);
12615}
12616
12617SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12618 SDLoc DL(Op);
12619 StoreSDNode *Store = cast<StoreSDNode>(Op);
12620 EVT VT = Store->getMemoryVT();
12621
12622 if (VT == MVT::i1) {
12623 return DAG.getTruncStore(
12624 Store->getChain(), DL,
12625 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12626 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12627 }
12628
12629 assert(VT.isVector() &&
12630 Store->getValue().getValueType().getScalarType() == MVT::i32);
12631
12632 unsigned AS = Store->getAddressSpace();
12633 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12634 Store->getAlign().value() < VT.getStoreSize() &&
12635 VT.getSizeInBits() > 32) {
12636 return SplitVectorStore(Op, DAG);
12637 }
12638
12639 MachineFunction &MF = DAG.getMachineFunction();
12640 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12641 // If there is a possibility that flat instruction access scratch memory
12642 // then we need to use the same legalization rules we use for private.
12643 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12644 !Subtarget->hasMultiDwordFlatScratchAddressing())
12645 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12646 ? AMDGPUAS::PRIVATE_ADDRESS
12647 : AMDGPUAS::GLOBAL_ADDRESS;
12648
12649 unsigned NumElements = VT.getVectorNumElements();
12650 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12651 if (NumElements > 4)
12652 return SplitVectorStore(Op, DAG);
12653 // v3 stores not supported on SI.
12654 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12655 return SplitVectorStore(Op, DAG);
12656
12657 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12658 VT, *Store->getMemOperand()))
12659 return expandUnalignedStore(Store, DAG);
12660
12661 return SDValue();
12662 }
12663 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12664 switch (Subtarget->getMaxPrivateElementSize()) {
12665 case 4:
12666 return scalarizeVectorStore(Store, DAG);
12667 case 8:
12668 if (NumElements > 2)
12669 return SplitVectorStore(Op, DAG);
12670 return SDValue();
12671 case 16:
12672 if (NumElements > 4 ||
12673 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12674 return SplitVectorStore(Op, DAG);
12675 return SDValue();
12676 default:
12677 llvm_unreachable("unsupported private_element_size");
12678 }
12679 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12680 unsigned Fast = 0;
12681 auto Flags = Store->getMemOperand()->getFlags();
12682 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12683 Store->getAlign(), Flags, &Fast) &&
12684 Fast > 1)
12685 return SDValue();
12686
12687 if (VT.isVector())
12688 return SplitVectorStore(Op, DAG);
12689
12690 return expandUnalignedStore(Store, DAG);
12691 }
12692
12693 // Probably an invalid store. If so we'll end up emitting a selection error.
12694 return SDValue();
12695}
12696
12697// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12698SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12699 SDLoc SL(Op);
12700 assert(!Subtarget->has16BitInsts());
12701 SDNodeFlags Flags = Op->getFlags();
12702 SDValue Ext =
12703 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12704
12705 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12706 SDValue Sqrt =
12707 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12708
12709 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12710 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12711}
12712
12713SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12714 SDLoc DL(Op);
12715 SDNodeFlags Flags = Op->getFlags();
12716 MVT VT = Op.getValueType().getSimpleVT();
12717 const SDValue X = Op.getOperand(0);
12718
12719 if (allowApproxFunc(DAG, Flags)) {
12720 // Instruction is 1ulp but ignores denormals.
12721 return DAG.getNode(
12722 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12723 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12724 }
12725
12726 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12727 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12728
12729 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12730
12731 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12732
12733 SDValue SqrtX =
12734 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12735
12736 SDValue SqrtS;
12737 if (needsDenormHandlingF32(DAG, X, Flags)) {
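// Refine the hardware sqrt estimate by probing its two 1-ulp neighbours:
// the bit patterns s-1 and s+1 are formed directly on the i32
// representation, and the residuals x - (s-1ulp)*s and x - (s+1ulp)*s
// decide whether to step the result down or up by one ulp.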
12738 SDValue SqrtID =
12739 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12740 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12741
12742 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12743 SDValue SqrtSNextDownInt =
12744 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12745 DAG.getAllOnesConstant(DL, MVT::i32));
12746 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12747
12748 SDValue NegSqrtSNextDown =
12749 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12750
12751 SDValue SqrtVP =
12752 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12753
12754 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12755 DAG.getConstant(1, DL, MVT::i32));
12756 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12757
12758 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12759 SDValue SqrtVS =
12760 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12761
12762 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12763 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12764
12765 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12766 Flags);
12767
12768 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12769 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12770 Flags);
12771 } else {
12772 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12773
12774 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12775
12776 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12777 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12778 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12779
12780 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12781 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12782 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12783
12784 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12785 SDValue SqrtD =
12786 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12787 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12788 }
12789
12790 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12791
12792 SDValue ScaledDown =
12793 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12794
12795 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12796 SDValue IsZeroOrInf =
12797 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12798 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12799
12800 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12801}
12802
12803SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12804 // For the double type, the SQRT and RSQ instructions don't have the required
12805 // precision, so we apply Goldschmidt's algorithm to improve the result:
12806 //
12807 // y0 = rsq(x)
12808 // g0 = x * y0
12809 // h0 = 0.5 * y0
12810 //
12811 // r0 = 0.5 - h0 * g0
12812 // g1 = g0 * r0 + g0
12813 // h1 = h0 * r0 + h0
12814 //
12815 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12816 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12817 // h2 = h1 * r1 + h1
12818 //
12819 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12820 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12821 //
12822 // sqrt(x) = g3
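//
// The rewrites on the right-hand side use the identity g*r == h*d (with
// g ~= sqrt(x) and h ~= 1/(2*sqrt(x))), so the residual d = x - g*g can be
// formed with a single FMA instead of materializing r.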
12823
12824 SDNodeFlags Flags = Op->getFlags();
12825
12826 SDLoc DL(Op);
12827
12828 SDValue X = Op.getOperand(0);
12829 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12830
12831 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12832
12833 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12834
12835 // Scale up input if it is too small.
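// (sqrt(x * 2^256) == sqrt(x) * 2^128, so the result is rescaled by 2^-128
// via the ldexp with ScaleDownFactor = -128 below.)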
12836 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12837 SDValue ScaleUp =
12838 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12839 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12840
12841 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12842
12843 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12844
12845 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12846 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12847
12848 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12849 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12850
12851 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12852
12853 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12854
12855 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12856 SDValue SqrtD0 =
12857 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12858
12859 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12860
12861 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12862 SDValue SqrtD1 =
12863 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12864
12865 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12866
12867 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12868 SDValue ScaleDown =
12869 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12870 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12871
12872 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12873 // with finite only or nsz because rsq(+/-0) = +/-inf
12874
12875 // TODO: Check for DAZ and expand to subnormals
12876 SDValue IsZeroOrInf =
12877 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12878 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12879
12880 // If x is +INF, +0, or -0, use its original value
12881 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12882 Flags);
12883}
12884
12885SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12886 SDLoc DL(Op);
12887 EVT VT = Op.getValueType();
12888 SDValue Arg = Op.getOperand(0);
12889 SDValue TrigVal;
12890
12891 // Propagate fast-math flags so that the multiply we introduce can be folded
12892 // if Arg is already the result of a multiply by constant.
12893 auto Flags = Op->getFlags();
12894
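// The SIN_HW/COS_HW nodes take their operand in turns (units of 2*pi
// radians), so the argument is pre-multiplied by 1/(2*pi); on subtargets
// with a reduced trig input range, the FRACT below additionally wraps it
// into [0, 1).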
12895 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12896
12897 if (Subtarget->hasTrigReducedRange()) {
12898 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12899 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12900 } else {
12901 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12902 }
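  // Editor's note: the SIN_HW/COS_HW nodes take their operand in units of
  // full turns rather than radians, which is why Arg is scaled by 1/(2*pi)
  // above. For example, Arg = pi maps to TrigVal = 0.5 and SIN_HW(0.5) = 0.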
12903
12904 switch (Op.getOpcode()) {
12905 case ISD::FCOS:
12906 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12907 case ISD::FSIN:
12908 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12909 default:
12910 llvm_unreachable("Wrong trig opcode");
12911 }
12912}
12913
12914SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12915 SelectionDAG &DAG) const {
12916 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12917 assert(AtomicNode->isCompareAndSwap());
12918 unsigned AS = AtomicNode->getAddressSpace();
12919
12920 // No custom lowering required for local address space
12921  if (AS == AMDGPUAS::LOCAL_ADDRESS)
12922 return Op;
12923
12924 // Non-local address space requires custom lowering for atomic compare
12925 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
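  // Editor's note: e.g. a 32-bit cmpxchg becomes an ATOMIC_CMP_SWAP node
  // whose data operand is the v2i32 pair {New, Old}, matching the packed
  // data/compare register layout the hardware cmpswap instructions consume.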
12926 SDLoc DL(Op);
12927 SDValue ChainIn = Op.getOperand(0);
12928 SDValue Addr = Op.getOperand(1);
12929 SDValue Old = Op.getOperand(2);
12930 SDValue New = Op.getOperand(3);
12931 EVT VT = Op.getValueType();
12932 MVT SimpleVT = VT.getSimpleVT();
12933 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12934
12935 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12936 SDValue Ops[] = {ChainIn, Addr, NewOld};
12937
12938 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12939 Op->getVTList(), Ops, VT,
12940 AtomicNode->getMemOperand());
12941}
12942
12943//===----------------------------------------------------------------------===//
12944// Custom DAG optimizations
12945//===----------------------------------------------------------------------===//
12946
12947SDValue
12948SITargetLowering::performUCharToFloatCombine(SDNode *N,
12949 DAGCombinerInfo &DCI) const {
12950 EVT VT = N->getValueType(0);
12951 EVT ScalarVT = VT.getScalarType();
12952 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12953 return SDValue();
12954
12955 SelectionDAG &DAG = DCI.DAG;
12956 SDLoc DL(N);
12957
12958 SDValue Src = N->getOperand(0);
12959 EVT SrcVT = Src.getValueType();
12960
12961 // TODO: We could try to match extracting the higher bytes, which would be
12962 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12963 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12964 // about in practice.
12965 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12966 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12967 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12968 DCI.AddToWorklist(Cvt.getNode());
12969
12970 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12971 if (ScalarVT != MVT::f32) {
12972 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12973 DAG.getTargetConstant(0, DL, MVT::i32));
12974 }
12975 return Cvt;
12976 }
12977 }
12978
12979 return SDValue();
12980}
12981
12982SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12983 DAGCombinerInfo &DCI) const {
12984 SDValue MagnitudeOp = N->getOperand(0);
12985 SDValue SignOp = N->getOperand(1);
12986
12987 // The generic combine for fcopysign + fp cast is too conservative with
12988 // vectors, and also gets confused by the splitting we will perform here, so
12989 // peek through FP casts.
12990 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12991 SignOp.getOpcode() == ISD::FP_ROUND)
12992 SignOp = SignOp.getOperand(0);
12993
12994 SelectionDAG &DAG = DCI.DAG;
12995 SDLoc DL(N);
12996 EVT SignVT = SignOp.getValueType();
12997
12998 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12999 // lower half with a copy.
13000 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13001 EVT MagVT = MagnitudeOp.getValueType();
13002
13003 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13004
13005 if (MagVT.getScalarType() == MVT::f64) {
13006 EVT F32VT = MagVT.isVector()
13007 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13008 : MVT::v2f32;
13009
13010 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13011
13012    SmallVector<SDValue, 8> NewElts;
13013 for (unsigned I = 0; I != NumElts; ++I) {
13014 SDValue MagLo =
13015 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13016 DAG.getConstant(2 * I, DL, MVT::i32));
13017 SDValue MagHi =
13018 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13019 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13020
13021 SDValue SignOpElt =
13022 MagVT.isVector()
13023              ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
13024 SignOp, DAG.getConstant(I, DL, MVT::i32))
13025 : SignOp;
13026
13027 SDValue HiOp =
13028 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13029
13030 SDValue Vector =
13031 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13032
13033 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13034 NewElts.push_back(NewElt);
13035 }
13036
13037 if (NewElts.size() == 1)
13038 return NewElts[0];
13039
13040 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13041 }
13042
13043 if (SignVT.getScalarType() != MVT::f64)
13044 return SDValue();
13045
13046 // Reduce width of sign operand, we only need the highest bit.
13047 //
13048 // fcopysign f64:x, f64:y ->
13049 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13050 // TODO: In some cases it might make sense to go all the way to f16.
13051
13052 EVT F32VT = MagVT.isVector()
13053 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13054 : MVT::v2f32;
13055
13056 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13057
13058 SmallVector<SDValue, 8> F32Signs;
13059 for (unsigned I = 0; I != NumElts; ++I) {
13060 // Take sign from odd elements of cast vector
13061 SDValue SignAsF32 =
13062 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13063 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13064 F32Signs.push_back(SignAsF32);
13065 }
13066
13067 SDValue NewSign =
13068 NumElts == 1
13069 ? F32Signs.back()
13070          : DAG.getNode(ISD::BUILD_VECTOR, DL,
13071 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13072 F32Signs);
13073
13074 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13075 NewSign);
13076}
13077
13078// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13079// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13080// bits
13081
13082// This is a variant of
13083// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13084//
13085// The normal DAG combiner will do this, but only if the add has one use since
13086// that would increase the number of instructions.
13087//
13088// This prevents us from seeing a constant offset that can be folded into a
13089// memory instruction's addressing mode. If we know the resulting add offset of
13090// a pointer can be folded into an addressing offset, we can replace the pointer
13091// operand with the add of the new constant offset. This eliminates one of the uses,
13092// and may allow the remaining use to also be simplified.
13093//
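// Illustrative example (editor's note): for a pointer computed as
// (shl (add x, 4), 2) where the add has several uses, this produces
// (add (shl x, 2), 16); the constant 16 can then usually be folded into the
// memory instruction's immediate offset, provided isLegalAddressingMode
// accepts it for the given address space.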
13094SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13095 EVT MemVT,
13096 DAGCombinerInfo &DCI) const {
13097 SDValue N0 = N->getOperand(0);
13098 SDValue N1 = N->getOperand(1);
13099
13100 // We only do this to handle cases where it's profitable when there are
13101 // multiple uses of the add, so defer to the standard combine.
13102 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13103 return SDValue();
13104
13105 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13106 if (!CN1)
13107 return SDValue();
13108
13109 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13110 if (!CAdd)
13111 return SDValue();
13112
13113 SelectionDAG &DAG = DCI.DAG;
13114
13115 if (N0->getOpcode() == ISD::OR &&
13116 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13117 return SDValue();
13118
13119 // If the resulting offset is too large, we can't fold it into the
13120 // addressing mode offset.
13121 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13122 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13123
13124 AddrMode AM;
13125 AM.HasBaseReg = true;
13126 AM.BaseOffs = Offset.getSExtValue();
13127 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13128 return SDValue();
13129
13130 SDLoc SL(N);
13131 EVT VT = N->getValueType(0);
13132
13133 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13134 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13135
13136 SDNodeFlags Flags;
13137 Flags.setNoUnsignedWrap(
13138 N->getFlags().hasNoUnsignedWrap() &&
13139 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13140
13141 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13142 // be sure that the new left operand is a proper base pointer.
13143 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13144}
13145
13146/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
13147/// by the chain and intrinsic ID. Theoretically we would also need to check the
13148/// specific intrinsic, but they all place the pointer operand first.
13149static unsigned getBasePtrIndex(const MemSDNode *N) {
13150 switch (N->getOpcode()) {
13151 case ISD::STORE:
13152  case ISD::INTRINSIC_W_CHAIN:
13153  case ISD::INTRINSIC_VOID:
13154 return 2;
13155 default:
13156 return 1;
13157 }
13158}
13159
13160SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13161 DAGCombinerInfo &DCI) const {
13162 SelectionDAG &DAG = DCI.DAG;
13163
13164 unsigned PtrIdx = getBasePtrIndex(N);
13165 SDValue Ptr = N->getOperand(PtrIdx);
13166
13167 // TODO: We could also do this for multiplies.
13168 if (Ptr.getOpcode() == ISD::SHL) {
13169 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13170 N->getMemoryVT(), DCI);
13171 if (NewPtr) {
13172 SmallVector<SDValue, 8> NewOps(N->ops());
13173
13174 NewOps[PtrIdx] = NewPtr;
13175 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13176 }
13177 }
13178
13179 return SDValue();
13180}
13181
13182static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13183 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13184 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13185 (Opc == ISD::XOR && Val == 0);
13186}
13187
13188// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13189// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13190// integer combine opportunities since most 64-bit operations are decomposed
13191// this way. TODO: We won't want this for SALU especially if it is an inline
13192// immediate.
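// Illustrative example (editor's note): (and i64:x, 0xffffffff00000000)
// splits into (and lo_32(x), 0) and (and hi_32(x), 0xffffffff); both halves
// are trivially reducible, leaving a zero low half and hi_32(x) unchanged.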
13193SDValue SITargetLowering::splitBinaryBitConstantOp(
13194 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13195 const ConstantSDNode *CRHS) const {
13196 uint64_t Val = CRHS->getZExtValue();
13197 uint32_t ValLo = Lo_32(Val);
13198 uint32_t ValHi = Hi_32(Val);
13199 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13200
13201 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13202       bitOpWithConstantIsReducible(Opc, ValHi)) ||
13203 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13204 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13205 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13206 !CRHS->user_begin()->isDivergent())
13207 return SDValue();
13208
13209 // If we need to materialize a 64-bit immediate, it will be split up later
13210 // anyway. Avoid creating the harder to understand 64-bit immediate
13211 // materialization.
13212 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13213 }
13214
13215 return SDValue();
13216}
13217
13218bool llvm::isBoolSGPR(SDValue V) {
13219 if (V.getValueType() != MVT::i1)
13220 return false;
13221 switch (V.getOpcode()) {
13222 default:
13223 break;
13224 case ISD::SETCC:
13225 case ISD::IS_FPCLASS:
13226 case AMDGPUISD::FP_CLASS:
13227 return true;
13228 case ISD::AND:
13229 case ISD::OR:
13230 case ISD::XOR:
13231 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13232 case ISD::SADDO:
13233 case ISD::UADDO:
13234 case ISD::SSUBO:
13235 case ISD::USUBO:
13236 case ISD::SMULO:
13237 case ISD::UMULO:
13238 return V.getResNo() == 1;
13239  case ISD::INTRINSIC_WO_CHAIN: {
13240 unsigned IntrinsicID = V.getConstantOperandVal(0);
13241 switch (IntrinsicID) {
13242 case Intrinsic::amdgcn_is_shared:
13243 case Intrinsic::amdgcn_is_private:
13244 return true;
13245 default:
13246 return false;
13247 }
13248
13249 return false;
13250 }
13251 }
13252 return false;
13253}
13254
13255// If a constant has all zeroes or all ones within each byte return it.
13256// Otherwise return 0.
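// Illustrative example (editor's note): C = 0x00ff00ff qualifies and is
// returned unchanged, while C = 0x00ff000f has a partially selected low byte
// and returns 0.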
13257static uint32_t getConstantPermuteMask(uint32_t C) {
13258  // 0xff for any zero byte in the mask
13259 uint32_t ZeroByteMask = 0;
13260 if (!(C & 0x000000ff))
13261 ZeroByteMask |= 0x000000ff;
13262 if (!(C & 0x0000ff00))
13263 ZeroByteMask |= 0x0000ff00;
13264 if (!(C & 0x00ff0000))
13265 ZeroByteMask |= 0x00ff0000;
13266 if (!(C & 0xff000000))
13267 ZeroByteMask |= 0xff000000;
13268 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13269 if ((NonZeroByteMask & C) != NonZeroByteMask)
13270 return 0; // Partial bytes selected.
13271 return C;
13272}
13273
13274// Check if a node selects whole bytes from its operand 0 starting at a byte
13275// boundary while masking the rest. Returns the select mask as used by
13276// v_perm_b32, or ~0 (all ones) if it does not succeed.
13277// Note byte select encoding:
13278// value 0-3 selects corresponding source byte;
13279// value 0xc selects zero;
13280// value 0xff selects 0xff.
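// Illustrative examples (editor's note): (and x, 0x0000ffff) yields the mask
// 0x0c0c0100 (keep bytes 1:0 of x, zero the rest), and (srl x, 16) yields
// 0x0c0c0302 (bytes 3:2 of x move into bytes 1:0, the upper bytes are zero).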
13281static uint32_t getPermuteMask(SDValue V) {
13282  assert(V.getValueSizeInBits() == 32);
13283
13284 if (V.getNumOperands() != 2)
13285 return ~0;
13286
13287 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13288 if (!N1)
13289 return ~0;
13290
13291 uint32_t C = N1->getZExtValue();
13292
13293 switch (V.getOpcode()) {
13294 default:
13295 break;
13296 case ISD::AND:
13297 if (uint32_t ConstMask = getConstantPermuteMask(C))
13298 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13299 break;
13300
13301 case ISD::OR:
13302 if (uint32_t ConstMask = getConstantPermuteMask(C))
13303 return (0x03020100 & ~ConstMask) | ConstMask;
13304 break;
13305
13306 case ISD::SHL:
13307 if (C % 8)
13308 return ~0;
13309
13310 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13311
13312 case ISD::SRL:
13313 if (C % 8)
13314 return ~0;
13315
13316 return uint32_t(0x0c0c0c0c03020100ull >> C);
13317 }
13318
13319 return ~0;
13320}
13321
13322SDValue SITargetLowering::performAndCombine(SDNode *N,
13323 DAGCombinerInfo &DCI) const {
13324 if (DCI.isBeforeLegalize())
13325 return SDValue();
13326
13327 SelectionDAG &DAG = DCI.DAG;
13328 EVT VT = N->getValueType(0);
13329 SDValue LHS = N->getOperand(0);
13330 SDValue RHS = N->getOperand(1);
13331
13332 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13333 if (VT == MVT::i64 && CRHS) {
13334 if (SDValue Split =
13335 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13336 return Split;
13337 }
13338
13339 if (CRHS && VT == MVT::i32) {
13340 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13341 // nb = number of trailing zeroes in mask
13342 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13343 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
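    // Illustrative example (editor's note): (and (srl x, 8), 0xff00) has
    // Mask = 0xff00 (Bits = 8, NB = 8), so Offset = 16 and the result is
    // (shl (bfe x, 16, 8), 8).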
13344 uint64_t Mask = CRHS->getZExtValue();
13345 unsigned Bits = llvm::popcount(Mask);
13346 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13347 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13348 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13349 unsigned Shift = CShift->getZExtValue();
13350 unsigned NB = CRHS->getAPIntValue().countr_zero();
13351 unsigned Offset = NB + Shift;
13352 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13353 SDLoc SL(N);
13354 SDValue BFE =
13355 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13356 DAG.getConstant(Offset, SL, MVT::i32),
13357 DAG.getConstant(Bits, SL, MVT::i32));
13358 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13359 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13360 DAG.getValueType(NarrowVT));
13361 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13362 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13363 return Shl;
13364 }
13365 }
13366 }
13367
13368 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13369 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13370 isa<ConstantSDNode>(LHS.getOperand(2))) {
13371 uint32_t Sel = getConstantPermuteMask(Mask);
13372 if (!Sel)
13373 return SDValue();
13374
13375 // Select 0xc for all zero bytes
13376 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13377 SDLoc DL(N);
13378 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13379 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13380 }
13381 }
13382
13383 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13384 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13385 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13386 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13387 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13388
13389 SDValue X = LHS.getOperand(0);
13390 SDValue Y = RHS.getOperand(0);
13391 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13392 !isTypeLegal(X.getValueType()))
13393 return SDValue();
13394
13395 if (LCC == ISD::SETO) {
13396 if (X != LHS.getOperand(1))
13397 return SDValue();
13398
13399 if (RCC == ISD::SETUNE) {
13400 const ConstantFPSDNode *C1 =
13401 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13402 if (!C1 || !C1->isInfinity() || C1->isNegative())
13403 return SDValue();
13404
13405        const uint32_t Mask = SIInstrFlags::N_NORMAL |
13406                              SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
13407                              SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
13408                              SIInstrFlags::P_SUBNORMAL;
13409
13410        static_assert(
13411            ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13412                SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13413             0x3ff) == Mask,
13414            "mask not equal");
13415
13416 SDLoc DL(N);
13417 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13418 DAG.getConstant(Mask, DL, MVT::i32));
13419 }
13420 }
13421 }
13422
13423 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13424 std::swap(LHS, RHS);
13425
13426 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13427 RHS.hasOneUse()) {
13428 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13429 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13430 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13431 // | n_nan)
13432 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13433 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13434 (RHS.getOperand(0) == LHS.getOperand(0) &&
13435 LHS.getOperand(0) == LHS.getOperand(1))) {
13436 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13437 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13438 : Mask->getZExtValue() & OrdMask;
13439
13440 SDLoc DL(N);
13441 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13442 DAG.getConstant(NewMask, DL, MVT::i32));
13443 }
13444 }
13445
13446 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13447 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13448 // and x, (sext cc from i1) => select cc, x, 0
13449 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13450 std::swap(LHS, RHS);
13451 if (isBoolSGPR(RHS.getOperand(0)))
13452 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13453 DAG.getConstant(0, SDLoc(N), MVT::i32));
13454 }
13455
13456 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13457 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13458 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13459 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13460 uint32_t LHSMask = getPermuteMask(LHS);
13461 uint32_t RHSMask = getPermuteMask(RHS);
13462 if (LHSMask != ~0u && RHSMask != ~0u) {
13463 // Canonicalize the expression in an attempt to have fewer unique masks
13464 // and therefore fewer registers used to hold the masks.
13465 if (LHSMask > RHSMask) {
13466 std::swap(LHSMask, RHSMask);
13467 std::swap(LHS, RHS);
13468 }
13469
13470 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13471 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13472 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13473 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13474
13475      // Check if we need to combine values from two sources within a byte.
13476 if (!(LHSUsedLanes & RHSUsedLanes) &&
13477 // If we select high and lower word keep it for SDWA.
13478 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13479 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13480        // Each byte in each mask is either a selector in the 0-3 range, or
13481        // has higher bits set: 0xff for a 0xff byte, or 0x0c for a zero byte.
13482        // If either mask has 0x0c in a byte, the result shall be 0x0c there.
13483        // Otherwise the mask that is not 0xff wins. ANDing both masks gives
13484        // the correct result, except that such bytes must be fixed up to 0x0c.
13485 uint32_t Mask = LHSMask & RHSMask;
13486 for (unsigned I = 0; I < 32; I += 8) {
13487 uint32_t ByteSel = 0xff << I;
13488 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13489 Mask &= (0x0c << I) & 0xffffffff;
13490 }
13491
13492 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13493 // or 0x0c.
13494 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13495 SDLoc DL(N);
13496
13497 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13498 RHS.getOperand(0),
13499 DAG.getConstant(Sel, DL, MVT::i32));
13500 }
13501 }
13502 }
13503
13504 return SDValue();
13505}
13506
13507// A key component of v_perm is a mapping between byte position of the src
13508// operands, and the byte position of the dest. To provide such, we need: 1. the
13509// node that provides x byte of the dest of the OR, and 2. the byte of the node
13510// used to provide that x byte. calculateByteProvider finds which node provides
13511// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13512// and finds an ultimate src and byte position. For example, the supported
13513// LoadCombine pattern for vector loads is as follows
13514// t1
13515// or
13516// / \
13517// t2 t3
13518// zext shl
13519// | | \
13520// t4 t5 16
13521// or anyext
13522// / \ |
13523// t6 t7 t8
13524// srl shl or
13525// / | / \ / \
13526// t9 t10 t11 t12 t13 t14
13527// trunc* 8 trunc* 8 and and
13528// | | / | | \
13529// t15 t16 t17 t18 t19 t20
13530// trunc* 255 srl -256
13531// | / \
13532// t15 t15 16
13533//
13534// *In this example, the truncs are from i32->i16
13535//
13536// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13537// respectively. calculateSrcByte would find (given node) -> ultimate src &
13538// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13539// After finding the mapping, we can combine the tree into vperm t15, t16,
13540// 0x05000407
13541
13542// Find the source and byte position from a node.
13543// \p DestByte is the byte position of the dest of the or that the src
13544// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13545// dest of the or byte. \p Depth tracks how many recursive iterations we have
13546// performed.
13547static const std::optional<ByteProvider<SDValue>>
13548calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13549 unsigned Depth = 0) {
13550 // We may need to recursively traverse a series of SRLs
13551 if (Depth >= 6)
13552 return std::nullopt;
13553
13554 if (Op.getValueSizeInBits() < 8)
13555 return std::nullopt;
13556
13557 if (Op.getValueType().isVector())
13558 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13559
13560 switch (Op->getOpcode()) {
13561 case ISD::TRUNCATE: {
13562 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13563 }
13564
13565 case ISD::SIGN_EXTEND:
13566 case ISD::ZERO_EXTEND:
13567  case ISD::SIGN_EXTEND_INREG: {
13568 SDValue NarrowOp = Op->getOperand(0);
13569 auto NarrowVT = NarrowOp.getValueType();
13570 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13571 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13572 NarrowVT = VTSign->getVT();
13573 }
13574 if (!NarrowVT.isByteSized())
13575 return std::nullopt;
13576 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13577
13578 if (SrcIndex >= NarrowByteWidth)
13579 return std::nullopt;
13580 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13581 }
13582
13583 case ISD::SRA:
13584 case ISD::SRL: {
13585 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13586 if (!ShiftOp)
13587 return std::nullopt;
13588
13589 uint64_t BitShift = ShiftOp->getZExtValue();
13590
13591 if (BitShift % 8 != 0)
13592 return std::nullopt;
13593
13594 SrcIndex += BitShift / 8;
13595
13596 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13597 }
13598
13599 default: {
13600 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13601 }
13602 }
13603 llvm_unreachable("fully handled switch");
13604}
13605
13606// For a byte position in the result of an Or, traverse the tree and find the
13607// node (and the byte of the node) which ultimately provides this {Or,
13608// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13609// the byte position of the Op that corresponds with the originally requested
13610// byte of the Or \p Depth tracks how many recursive iterations we have
13611// performed. \p StartingIndex is the originally requested byte of the Or
13612static const std::optional<ByteProvider<SDValue>>
13613calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13614 unsigned StartingIndex = 0) {
13615 // Finding Src tree of RHS of or typically requires at least 1 additional
13616 // depth
13617 if (Depth > 6)
13618 return std::nullopt;
13619
13620 unsigned BitWidth = Op.getScalarValueSizeInBits();
13621 if (BitWidth % 8 != 0)
13622 return std::nullopt;
13623 if (Index > BitWidth / 8 - 1)
13624 return std::nullopt;
13625
13626 bool IsVec = Op.getValueType().isVector();
13627 switch (Op.getOpcode()) {
13628 case ISD::OR: {
13629 if (IsVec)
13630 return std::nullopt;
13631
13632 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13633 StartingIndex);
13634 if (!RHS)
13635 return std::nullopt;
13636 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13637 StartingIndex);
13638 if (!LHS)
13639 return std::nullopt;
13640 // A well formed Or will have two ByteProviders for each byte, one of which
13641 // is constant zero
13642 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13643 return std::nullopt;
13644 if (!LHS || LHS->isConstantZero())
13645 return RHS;
13646 if (!RHS || RHS->isConstantZero())
13647 return LHS;
13648 return std::nullopt;
13649 }
13650
13651 case ISD::AND: {
13652 if (IsVec)
13653 return std::nullopt;
13654
13655 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13656 if (!BitMaskOp)
13657 return std::nullopt;
13658
13659 uint32_t BitMask = BitMaskOp->getZExtValue();
13660 // Bits we expect for our StartingIndex
13661 uint32_t IndexMask = 0xFF << (Index * 8);
13662
13663 if ((IndexMask & BitMask) != IndexMask) {
13664 // If the result of the and partially provides the byte, then it
13665 // is not well formatted
13666 if (IndexMask & BitMask)
13667 return std::nullopt;
13668      return ByteProvider<SDValue>::getConstantZero();
13669 }
13670
13671 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13672 }
13673
13674 case ISD::FSHR: {
13675 if (IsVec)
13676 return std::nullopt;
13677
13678 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13679 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13680 if (!ShiftOp || Op.getValueType().isVector())
13681 return std::nullopt;
13682
13683 uint64_t BitsProvided = Op.getValueSizeInBits();
13684 if (BitsProvided % 8 != 0)
13685 return std::nullopt;
13686
13687 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13688 if (BitShift % 8)
13689 return std::nullopt;
13690
13691 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13692 uint64_t ByteShift = BitShift / 8;
13693
13694 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13695 uint64_t BytesProvided = BitsProvided / 8;
13696 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13697 NewIndex %= BytesProvided;
13698 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13699 }
13700
13701 case ISD::SRA:
13702 case ISD::SRL: {
13703 if (IsVec)
13704 return std::nullopt;
13705
13706 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13707 if (!ShiftOp)
13708 return std::nullopt;
13709
13710 uint64_t BitShift = ShiftOp->getZExtValue();
13711 if (BitShift % 8)
13712 return std::nullopt;
13713
13714 auto BitsProvided = Op.getScalarValueSizeInBits();
13715 if (BitsProvided % 8 != 0)
13716 return std::nullopt;
13717
13718 uint64_t BytesProvided = BitsProvided / 8;
13719 uint64_t ByteShift = BitShift / 8;
13720 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13721 // If the byte we are trying to provide (as tracked by index) falls in this
13722 // range, then the SRL provides the byte. The byte of interest of the src of
13723 // the SRL is Index + ByteShift
13724 return BytesProvided - ByteShift > Index
13725 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13726 Index + ByteShift)
13727               : ByteProvider<SDValue>::getConstantZero();
13728 }
13729
13730 case ISD::SHL: {
13731 if (IsVec)
13732 return std::nullopt;
13733
13734 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13735 if (!ShiftOp)
13736 return std::nullopt;
13737
13738 uint64_t BitShift = ShiftOp->getZExtValue();
13739 if (BitShift % 8 != 0)
13740 return std::nullopt;
13741 uint64_t ByteShift = BitShift / 8;
13742
13743 // If we are shifting by an amount greater than (or equal to)
13744 // the index we are trying to provide, then it provides 0s. If not,
13745    // then these bytes are not definitively 0s, and the corresponding byte
13746 // of interest is Index - ByteShift of the src
13747 return Index < ByteShift
13748               ? ByteProvider<SDValue>::getConstantZero()
13749 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13750 Depth + 1, StartingIndex);
13751 }
13752 case ISD::ANY_EXTEND:
13753 case ISD::SIGN_EXTEND:
13754 case ISD::ZERO_EXTEND:
13755  case ISD::SIGN_EXTEND_INREG:
13756 case ISD::AssertZext:
13757 case ISD::AssertSext: {
13758 if (IsVec)
13759 return std::nullopt;
13760
13761 SDValue NarrowOp = Op->getOperand(0);
13762 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13763 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13764 Op->getOpcode() == ISD::AssertZext ||
13765 Op->getOpcode() == ISD::AssertSext) {
13766 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13767 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13768 }
13769 if (NarrowBitWidth % 8 != 0)
13770 return std::nullopt;
13771 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13772
13773 if (Index >= NarrowByteWidth)
13774 return Op.getOpcode() == ISD::ZERO_EXTEND
13775 ? std::optional<ByteProvider<SDValue>>(
13776                     ByteProvider<SDValue>::getConstantZero())
13777 : std::nullopt;
13778 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13779 }
13780
13781 case ISD::TRUNCATE: {
13782 if (IsVec)
13783 return std::nullopt;
13784
13785 uint64_t NarrowByteWidth = BitWidth / 8;
13786
13787 if (NarrowByteWidth >= Index) {
13788 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13789 StartingIndex);
13790 }
13791
13792 return std::nullopt;
13793 }
13794
13795 case ISD::CopyFromReg: {
13796 if (BitWidth / 8 > Index)
13797 return calculateSrcByte(Op, StartingIndex, Index);
13798
13799 return std::nullopt;
13800 }
13801
13802 case ISD::LOAD: {
13803 auto *L = cast<LoadSDNode>(Op.getNode());
13804
13805 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13806 if (NarrowBitWidth % 8 != 0)
13807 return std::nullopt;
13808 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13809
13810    // If the width of the load does not reach the byte we are trying to provide for
13811 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13812 // question
13813 if (Index >= NarrowByteWidth) {
13814 return L->getExtensionType() == ISD::ZEXTLOAD
13815 ? std::optional<ByteProvider<SDValue>>(
13816                     ByteProvider<SDValue>::getConstantZero())
13817 : std::nullopt;
13818 }
13819
13820 if (NarrowByteWidth > Index) {
13821 return calculateSrcByte(Op, StartingIndex, Index);
13822 }
13823
13824 return std::nullopt;
13825 }
13826
13827 case ISD::BSWAP: {
13828 if (IsVec)
13829 return std::nullopt;
13830
13831 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13832 Depth + 1, StartingIndex);
13833 }
13834
13835  case ISD::EXTRACT_VECTOR_ELT: {
13836 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13837 if (!IdxOp)
13838 return std::nullopt;
13839 auto VecIdx = IdxOp->getZExtValue();
13840 auto ScalarSize = Op.getScalarValueSizeInBits();
13841 if (ScalarSize < 32)
13842 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13843 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13844 StartingIndex, Index);
13845 }
13846
13847 case AMDGPUISD::PERM: {
13848 if (IsVec)
13849 return std::nullopt;
13850
13851 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13852 if (!PermMask)
13853 return std::nullopt;
13854
13855 auto IdxMask =
13856 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13857 if (IdxMask > 0x07 && IdxMask != 0x0c)
13858 return std::nullopt;
13859
13860 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13861 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13862
13863 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13864                           : ByteProvider<SDValue>(
13865                                 ByteProvider<SDValue>::getConstantZero());
13866 }
13867
13868 default: {
13869 return std::nullopt;
13870 }
13871 }
13872
13873 llvm_unreachable("fully handled switch");
13874}
13875
13876// Returns true if the Operand is a scalar that is, or is extended from, 16 bits
13877static bool isExtendedFrom16Bits(SDValue &Operand) {
13878
13879 switch (Operand.getOpcode()) {
13880 case ISD::ANY_EXTEND:
13881 case ISD::SIGN_EXTEND:
13882 case ISD::ZERO_EXTEND: {
13883 auto OpVT = Operand.getOperand(0).getValueType();
13884 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13885 }
13886 case ISD::LOAD: {
13887 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13888 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13889 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13890 ExtType == ISD::EXTLOAD) {
13891 auto MemVT = L->getMemoryVT();
13892 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13893 }
13894 return L->getMemoryVT().getSizeInBits() == 16;
13895 }
13896 default:
13897 return false;
13898 }
13899}
13900
13901// Returns true if the mask matches consecutive bytes, and the first byte
13902// begins at an even (i.e. 16-bit aligned) byte offset from the 0th byte
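// Illustrative example (editor's note): mask 0x0504 selects two consecutive
// bytes starting at an even offset and returns true; mask 0x0403 is
// consecutive but starts at an odd byte, so it returns false.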
13903static bool addresses16Bits(int Mask) {
13904 int Low8 = Mask & 0xff;
13905 int Hi8 = (Mask & 0xff00) >> 8;
13906
13907 assert(Low8 < 8 && Hi8 < 8);
13908 // Are the bytes contiguous in the order of increasing addresses.
13909 bool IsConsecutive = (Hi8 - Low8 == 1);
13910 // Is the first byte at location that is aligned for 16 bit instructions.
13911 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13912 // In this case, we still need code to extract the 16 bit operand, so it
13913 // is better to use i8 v_perm
13914 bool Is16Aligned = !(Low8 % 2);
13915
13916 return IsConsecutive && Is16Aligned;
13917}
13918
13919// Do not lower into v_perm if the operands are actually 16 bit
13920// and the selected bits (based on PermMask) correspond with two
13921// easily addressable 16 bit operands.
13922static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13923 SDValue &OtherOp) {
13924 int Low16 = PermMask & 0xffff;
13925 int Hi16 = (PermMask & 0xffff0000) >> 16;
13926
13927 auto TempOp = peekThroughBitcasts(Op);
13928 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13929
13930 auto OpIs16Bit =
13931 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13932 if (!OpIs16Bit)
13933 return true;
13934
13935 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13936 isExtendedFrom16Bits(TempOtherOp);
13937 if (!OtherOpIs16Bit)
13938 return true;
13939
13940 // Do we cleanly address both
13941 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13942}
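// Editor's note: the helper below returns the 32-bit word of \p Src at dword
// offset \p DWordOffset as an i32. For an i64 source and DWordOffset == 1
// this amounts to (trunc (srl Src, 32)); for vector sources the covering
// elements are extracted and bitcast.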
13943
13944static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13945 unsigned DWordOffset) {
13946 SDValue Ret;
13947
13948 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13949 // ByteProvider must be at least 8 bits
13950 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13951
13952 if (TypeSize <= 32)
13953 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13954
13955 if (Src.getValueType().isVector()) {
13956 auto ScalarTySize = Src.getScalarValueSizeInBits();
13957 auto ScalarTy = Src.getValueType().getScalarType();
13958 if (ScalarTySize == 32) {
13959 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13960 DAG.getConstant(DWordOffset, SL, MVT::i32));
13961 }
13962 if (ScalarTySize > 32) {
13963 Ret = DAG.getNode(
13964 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13965 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13966 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13967 if (ShiftVal)
13968 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13969 DAG.getConstant(ShiftVal, SL, MVT::i32));
13970 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13971 }
13972
13973 assert(ScalarTySize < 32);
13974 auto NumElements = TypeSize / ScalarTySize;
13975 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13976 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13977 auto NumElementsIn32 = 32 / ScalarTySize;
13978 auto NumAvailElements = DWordOffset < Trunc32Elements
13979 ? NumElementsIn32
13980 : NumElements - NormalizedTrunc;
13981
13983 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13984 NumAvailElements);
13985
13986 Ret = DAG.getBuildVector(
13987 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13988 VecSrcs);
13989 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13990 }
13991
13992 /// Scalar Type
13993 auto ShiftVal = 32 * DWordOffset;
13994 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13995 DAG.getConstant(ShiftVal, SL, MVT::i32));
13996 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13997}
13998
13999static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14000 SelectionDAG &DAG = DCI.DAG;
14001 [[maybe_unused]] EVT VT = N->getValueType(0);
14002  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14003
14004 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14005 assert(VT == MVT::i32);
14006 for (int i = 0; i < 4; i++) {
14007 // Find the ByteProvider that provides the ith byte of the result of OR
14008 std::optional<ByteProvider<SDValue>> P =
14009 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14010 // TODO support constantZero
14011 if (!P || P->isConstantZero())
14012 return SDValue();
14013
14014 PermNodes.push_back(*P);
14015 }
14016 if (PermNodes.size() != 4)
14017 return SDValue();
14018
14019 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14020 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14021 uint64_t PermMask = 0x00000000;
14022 for (size_t i = 0; i < PermNodes.size(); i++) {
14023 auto PermOp = PermNodes[i];
14024 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14025 // by sizeof(Src2) = 4
14026 int SrcByteAdjust = 4;
14027
14028 // If the Src uses a byte from a different DWORD, then it corresponds
14029    // with a different source
14030 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14031 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14032 if (SecondSrc)
14033 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14034 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14035 return SDValue();
14036
14037 // Set the index of the second distinct Src node
14038 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14039 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14040 SrcByteAdjust = 0;
14041 }
14042 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14044 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14045 }
14046 SDLoc DL(N);
14047 SDValue Op = *PermNodes[FirstSrc.first].Src;
14048 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14049 assert(Op.getValueSizeInBits() == 32);
14050
14051 // Check that we are not just extracting the bytes in order from an op
14052 if (!SecondSrc) {
14053 int Low16 = PermMask & 0xffff;
14054 int Hi16 = (PermMask & 0xffff0000) >> 16;
14055
14056 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14057 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14058
14059 // The perm op would really just produce Op. So combine into Op
14060 if (WellFormedLow && WellFormedHi)
14061 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14062 }
14063
14064 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14065
14066 if (SecondSrc) {
14067 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14068 assert(OtherOp.getValueSizeInBits() == 32);
14069 }
14070
14071 // Check that we haven't just recreated the same FSHR node.
14072 if (N->getOpcode() == ISD::FSHR &&
14073 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14074 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14075 return SDValue();
14076
14077 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14078
14079 assert(Op.getValueType().isByteSized() &&
14080 OtherOp.getValueType().isByteSized());
14081
14082 // If the ultimate src is less than 32 bits, then we will only be
14083 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14084 // CalculateByteProvider would not have returned Op as source if we
14085 // used a byte that is outside its ValueType. Thus, we are free to
14086    // ANY_EXTEND as the extended bits are don't-cares.
14087 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14088 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14089
14090 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14091 DAG.getConstant(PermMask, DL, MVT::i32));
14092 }
14093 return SDValue();
14094}
14095
14096SDValue SITargetLowering::performOrCombine(SDNode *N,
14097 DAGCombinerInfo &DCI) const {
14098 SelectionDAG &DAG = DCI.DAG;
14099 SDValue LHS = N->getOperand(0);
14100 SDValue RHS = N->getOperand(1);
14101
14102 EVT VT = N->getValueType(0);
14103 if (VT == MVT::i1) {
14104 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14105 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14106 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14107 SDValue Src = LHS.getOperand(0);
14108 if (Src != RHS.getOperand(0))
14109 return SDValue();
14110
14111 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14112 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14113 if (!CLHS || !CRHS)
14114 return SDValue();
14115
14116 // Only 10 bits are used.
14117 static const uint32_t MaxMask = 0x3ff;
14118
14119 uint32_t NewMask =
14120 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14121 SDLoc DL(N);
14122 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14123 DAG.getConstant(NewMask, DL, MVT::i32));
14124 }
14125
14126 return SDValue();
14127 }
14128
14129 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14130  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14131 LHS.getOpcode() == AMDGPUISD::PERM &&
14132 isa<ConstantSDNode>(LHS.getOperand(2))) {
14133 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14134 if (!Sel)
14135 return SDValue();
14136
14137 Sel |= LHS.getConstantOperandVal(2);
14138 SDLoc DL(N);
14139 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14140 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14141 }
14142
14143 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
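  // Illustrative example (editor's note): for (or (and x, 0xff), (shl y, 8))
  // the operand masks are 0x0c0c0c00 and 0x0201000c; after canonicalization
  // and mask combining the selector becomes 0x06050400, i.e.
  // v_perm_b32(y, x, 0x06050400), which takes byte 0 from x and bytes 1..3
  // from the low three bytes of y.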
14144 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14145 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14146 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14147
14148 // If all the uses of an or need to extract the individual elements, do not
14149 // attempt to lower into v_perm
14150 auto usesCombinedOperand = [](SDNode *OrUse) {
14151 // If we have any non-vectorized use, then it is a candidate for v_perm
14152 if (OrUse->getOpcode() != ISD::BITCAST ||
14153 !OrUse->getValueType(0).isVector())
14154 return true;
14155
14156 // If we have any non-vectorized use, then it is a candidate for v_perm
14157 for (auto *VUser : OrUse->users()) {
14158 if (!VUser->getValueType(0).isVector())
14159 return true;
14160
14161 // If the use of a vector is a store, then combining via a v_perm
14162 // is beneficial.
14163 // TODO -- whitelist more uses
14164 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14165 if (VUser->getOpcode() == VectorwiseOp)
14166 return true;
14167 }
14168 return false;
14169 };
14170
14171 if (!any_of(N->users(), usesCombinedOperand))
14172 return SDValue();
14173
14174 uint32_t LHSMask = getPermuteMask(LHS);
14175 uint32_t RHSMask = getPermuteMask(RHS);
14176
14177 if (LHSMask != ~0u && RHSMask != ~0u) {
14178 // Canonicalize the expression in an attempt to have fewer unique masks
14179 // and therefore fewer registers used to hold the masks.
14180 if (LHSMask > RHSMask) {
14181 std::swap(LHSMask, RHSMask);
14182 std::swap(LHS, RHS);
14183 }
14184
14185 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14186 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14187 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14188 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14189
14190      // Check if we need to combine values from two sources within a byte.
14191 if (!(LHSUsedLanes & RHSUsedLanes) &&
14192 // If we select high and lower word keep it for SDWA.
14193 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14194 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14195 // Kill zero bytes selected by other mask. Zero value is 0xc.
14196 LHSMask &= ~RHSUsedLanes;
14197 RHSMask &= ~LHSUsedLanes;
14198 // Add 4 to each active LHS lane
14199 LHSMask |= LHSUsedLanes & 0x04040404;
14200 // Combine masks
14201 uint32_t Sel = LHSMask | RHSMask;
14202 SDLoc DL(N);
14203
14204 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14205 RHS.getOperand(0),
14206 DAG.getConstant(Sel, DL, MVT::i32));
14207 }
14208 }
14209 if (LHSMask == ~0u || RHSMask == ~0u) {
14210 if (SDValue Perm = matchPERM(N, DCI))
14211 return Perm;
14212 }
14213 }
14214
14215 // Detect identity v2i32 OR and replace with identity source node.
14216 // Specifically an Or that has operands constructed from the same source node
14217 // via extract_vector_elt and build_vector. I.E.
14218 // v2i32 or(
14219 // v2i32 build_vector(
14220 // i32 extract_elt(%IdentitySrc, 0),
14221 // i32 0
14222 // ),
14223 // v2i32 build_vector(
14224 // i32 0,
14225 // i32 extract_elt(%IdentitySrc, 1)
14226 // ) )
14227 // =>
14228 // v2i32 %IdentitySrc
14229
14230 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14231 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14232
14233 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14234 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14235
14236 // Test for and normalise build vectors.
14237 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14238
14239 // Get the extract_vector_element operands.
14240 SDValue LEVE = LHS->getOperand(0);
14241 SDValue REVE = RHS->getOperand(1);
14242
14243 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14244          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14245 // Check that different elements from the same vector are
14246 // extracted.
14247 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14248 LEVE->getOperand(1) != REVE->getOperand(1)) {
14249 SDValue IdentitySrc = LEVE.getOperand(0);
14250 return IdentitySrc;
14251 }
14252 }
14253 }
14254 }
14255
14256 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14257 return SDValue();
14258
14259 // TODO: This could be a generic combine with a predicate for extracting the
14260 // high half of an integer being free.
14261
14262 // (or i64:x, (zero_extend i32:y)) ->
14263 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14264 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14265 RHS.getOpcode() != ISD::ZERO_EXTEND)
14266 std::swap(LHS, RHS);
14267
14268 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14269 SDValue ExtSrc = RHS.getOperand(0);
14270 EVT SrcVT = ExtSrc.getValueType();
14271 if (SrcVT == MVT::i32) {
14272 SDLoc SL(N);
14273 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14274 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14275
14276 DCI.AddToWorklist(LowOr.getNode());
14277 DCI.AddToWorklist(HiBits.getNode());
14278
14279 SDValue Vec =
14280 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14281 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14282 }
14283 }
14284
14285 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14286 if (CRHS) {
14287 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14288 N->getOperand(0), CRHS))
14289 return Split;
14290 }
14291
14292 return SDValue();
14293}
14294
14295SDValue SITargetLowering::performXorCombine(SDNode *N,
14296 DAGCombinerInfo &DCI) const {
14297 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14298 return RV;
14299
14300 SDValue LHS = N->getOperand(0);
14301 SDValue RHS = N->getOperand(1);
14302
14303 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14304 SelectionDAG &DAG = DCI.DAG;
14305
14306 EVT VT = N->getValueType(0);
14307 if (CRHS && VT == MVT::i64) {
14308 if (SDValue Split =
14309 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14310 return Split;
14311 }
14312
14313 // v2i32 (xor (vselect cc, x, y), K) ->
14314  // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14315 // replaced with source modifiers when the select is lowered to CNDMASK.
14316 unsigned Opc = LHS.getOpcode();
14317 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14318 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14319 CRHS && CRHS->getAPIntValue().isSignMask()) {
14320 SDValue CC = LHS->getOperand(0);
14321 SDValue TRUE = LHS->getOperand(1);
14322 SDValue FALSE = LHS->getOperand(2);
14323 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14324 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14325 SDValue XSelect =
14326 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14327 return XSelect;
14328 }
14329
14330 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14331 // fneg-like xors into 64-bit select.
14332 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14333 // This looks like an fneg, try to fold as a source modifier.
14334 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14335        shouldFoldFNegIntoSrc(N, LHS)) {
14336 // xor (select c, a, b), 0x80000000 ->
14337 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14338 SDLoc DL(N);
14339 SDValue CastLHS =
14340 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14341 SDValue CastRHS =
14342 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14343 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14344 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14345 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14346 LHS->getOperand(0), FNegLHS, FNegRHS);
14347 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14348 }
14349 }
14350
14351 return SDValue();
14352}
14353
14354SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14355 DAGCombinerInfo &DCI) const {
14356 if (!Subtarget->has16BitInsts() ||
14357 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14358 return SDValue();
14359
14360 EVT VT = N->getValueType(0);
14361 if (VT != MVT::i32)
14362 return SDValue();
14363
14364 SDValue Src = N->getOperand(0);
14365 if (Src.getValueType() != MVT::i16)
14366 return SDValue();
14367
14368 return SDValue();
14369}
14370
14371SDValue
14372SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14373 DAGCombinerInfo &DCI) const {
14374 SDValue Src = N->getOperand(0);
14375 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14376
14377 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14378 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14379 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14380 VTSign->getVT() == MVT::i8) ||
14381 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14382 VTSign->getVT() == MVT::i16))) {
14383 assert(Subtarget->hasScalarSubwordLoads() &&
14384 "s_buffer_load_{u8, i8} are supported "
14385 "in GFX12 (or newer) architectures.");
14386 EVT VT = Src.getValueType();
14387 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14388 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14389 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14390 SDLoc DL(N);
14391 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14392 SDValue Ops[] = {
14393 Src.getOperand(0), // source register
14394 Src.getOperand(1), // offset
14395 Src.getOperand(2) // cachePolicy
14396 };
14397 auto *M = cast<MemSDNode>(Src);
14398 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14399 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14400 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14401 return LoadVal;
14402 }
14403 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14404 VTSign->getVT() == MVT::i8) ||
14405 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14406 VTSign->getVT() == MVT::i16)) &&
14407 Src.hasOneUse()) {
14408 auto *M = cast<MemSDNode>(Src);
14409 SDValue Ops[] = {Src.getOperand(0), // Chain
14410 Src.getOperand(1), // rsrc
14411 Src.getOperand(2), // vindex
14412 Src.getOperand(3), // voffset
14413 Src.getOperand(4), // soffset
14414 Src.getOperand(5), // offset
14415 Src.getOperand(6), Src.getOperand(7)};
14416 // replace with BUFFER_LOAD_BYTE/SHORT
14417 SDVTList ResList =
14418 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14419 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14420 ? AMDGPUISD::BUFFER_LOAD_BYTE
14421 : AMDGPUISD::BUFFER_LOAD_SHORT;
14422 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14423 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14424 return DCI.DAG.getMergeValues(
14425 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14426 }
14427 return SDValue();
14428}
14429
14430SDValue SITargetLowering::performClassCombine(SDNode *N,
14431 DAGCombinerInfo &DCI) const {
14432 SelectionDAG &DAG = DCI.DAG;
14433 SDValue Mask = N->getOperand(1);
14434
14435 // fp_class x, 0 -> false
14436 if (isNullConstant(Mask))
14437 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14438
14439 if (N->getOperand(0).isUndef())
14440 return DAG.getUNDEF(MVT::i1);
14441
14442 return SDValue();
14443}
14444
14445SDValue SITargetLowering::performRcpCombine(SDNode *N,
14446 DAGCombinerInfo &DCI) const {
14447 EVT VT = N->getValueType(0);
14448 SDValue N0 = N->getOperand(0);
14449
14450 if (N0.isUndef()) {
14451 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14452 SDLoc(N), VT);
14453 }
14454
14455 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14456 N0.getOpcode() == ISD::SINT_TO_FP)) {
14457 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14458 N->getFlags());
14459 }
14460
14461 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14462 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14463 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14464 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14465 N->getFlags());
14466 }
14467
14468  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14469}
14470
14471bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14472 unsigned MaxDepth) const {
14473 unsigned Opcode = Op.getOpcode();
14474 if (Opcode == ISD::FCANONICALIZE)
14475 return true;
14476
14477 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14478 const auto &F = CFP->getValueAPF();
14479 if (F.isNaN() && F.isSignaling())
14480 return false;
14481 if (!F.isDenormal())
14482 return true;
14483
14484 DenormalMode Mode =
14485 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14486 return Mode == DenormalMode::getIEEE();
14487 }
14488
14489 // If source is a result of another standard FP operation it is already in
14490 // canonical form.
14491 if (MaxDepth == 0)
14492 return false;
14493
14494 switch (Opcode) {
14495 // These will flush denorms if required.
14496 case ISD::FADD:
14497 case ISD::FSUB:
14498 case ISD::FMUL:
14499 case ISD::FCEIL:
14500 case ISD::FFLOOR:
14501 case ISD::FMA:
14502 case ISD::FMAD:
14503 case ISD::FSQRT:
14504 case ISD::FDIV:
14505 case ISD::FREM:
14506 case ISD::FP_ROUND:
14507 case ISD::FP_EXTEND:
14508 case ISD::FP16_TO_FP:
14509 case ISD::FP_TO_FP16:
14510 case ISD::BF16_TO_FP:
14511 case ISD::FP_TO_BF16:
14512 case ISD::FLDEXP:
14513 case AMDGPUISD::FMUL_LEGACY:
14514 case AMDGPUISD::FMAD_FTZ:
14515 case AMDGPUISD::RCP:
14516 case AMDGPUISD::RSQ:
14517 case AMDGPUISD::RSQ_CLAMP:
14518 case AMDGPUISD::RCP_LEGACY:
14519 case AMDGPUISD::RCP_IFLAG:
14520 case AMDGPUISD::LOG:
14521 case AMDGPUISD::EXP:
14522 case AMDGPUISD::DIV_SCALE:
14523 case AMDGPUISD::DIV_FMAS:
14524 case AMDGPUISD::DIV_FIXUP:
14525 case AMDGPUISD::FRACT:
14526 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14527 case AMDGPUISD::CVT_F32_UBYTE0:
14528 case AMDGPUISD::CVT_F32_UBYTE1:
14529 case AMDGPUISD::CVT_F32_UBYTE2:
14530 case AMDGPUISD::CVT_F32_UBYTE3:
14531 case AMDGPUISD::FP_TO_FP16:
14532 case AMDGPUISD::SIN_HW:
14533 case AMDGPUISD::COS_HW:
14534 return true;
14535
14536 // These can/will be lowered or combined as bit operations, so we need to
14537 // check their inputs recursively.
14538 case ISD::FNEG:
14539 case ISD::FABS:
14540 case ISD::FCOPYSIGN:
14541 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14542
14543 case ISD::AND:
14544 if (Op.getValueType() == MVT::i32) {
14545 // Be careful, as we only know it is a bitcast floating point type. It
14546 // could be f32 or v2f16; we have no way of knowing. Luckily the constant
14547 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14548 // is valid to optimize for all types.
14549 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14550 if (RHS->getZExtValue() == 0xffff0000) {
14551 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14552 }
14553 }
14554 }
14555 break;
14556
14557 case ISD::FSIN:
14558 case ISD::FCOS:
14559 case ISD::FSINCOS:
14560 return Op.getValueType().getScalarType() != MVT::f16;
14561
14562 case ISD::FMINNUM:
14563 case ISD::FMAXNUM:
14564 case ISD::FMINNUM_IEEE:
14565 case ISD::FMAXNUM_IEEE:
14566 case ISD::FMINIMUM:
14567 case ISD::FMAXIMUM:
14568 case ISD::FMINIMUMNUM:
14569 case ISD::FMAXIMUMNUM:
14570 case AMDGPUISD::CLAMP:
14571 case AMDGPUISD::FMED3:
14572 case AMDGPUISD::FMAX3:
14573 case AMDGPUISD::FMIN3:
14574 case AMDGPUISD::FMAXIMUM3:
14575 case AMDGPUISD::FMINIMUM3: {
14576 // FIXME: Shouldn't treat the generic operations differently based on these.
14577 // However, we aren't really required to flush the result from
14578 // minnum/maxnum.
14579
14580 // snans will be quieted, so we only need to worry about denormals.
14581 if (Subtarget->supportsMinMaxDenormModes() ||
14582 // FIXME: denormalsEnabledForType is broken for dynamic
14583 denormalsEnabledForType(DAG, Op.getValueType()))
14584 return true;
14585
14586 // Flushing may be required.
14587 // On pre-GFX9 targets, V_MIN_F32 and others do not flush denorms, so for
14588 // such targets we need to check the inputs recursively.
14589
14590 // FIXME: Does this apply with clamp? It's implemented with max.
14591 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14592 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14593 return false;
14594 }
14595
14596 return true;
14597 }
14598 case ISD::SELECT: {
14599 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14600 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14601 }
14602 case ISD::BUILD_VECTOR: {
14603 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14604 SDValue SrcOp = Op.getOperand(i);
14605 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14606 return false;
14607 }
14608
14609 return true;
14610 }
14611 case ISD::EXTRACT_VECTOR_ELT:
14612 case ISD::EXTRACT_SUBVECTOR: {
14613 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14614 }
14615 case ISD::INSERT_VECTOR_ELT: {
14616 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14617 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14618 }
14619 case ISD::UNDEF:
14620 // Could be anything.
14621 return false;
14622
14623 case ISD::BITCAST:
14624 // TODO: This is incorrect as it loses track of the operand's type. We may
14625 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14626 // same bits that are canonicalized in one type need not be in the other.
14627 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14628 case ISD::TRUNCATE: {
14629 // Hack around the mess we make when legalizing extract_vector_elt.
14630 if (Op.getValueType() == MVT::i16) {
14631 SDValue TruncSrc = Op.getOperand(0);
14632 if (TruncSrc.getValueType() == MVT::i32 &&
14633 TruncSrc.getOpcode() == ISD::BITCAST &&
14634 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14635 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14636 }
14637 }
14638 return false;
14639 }
14640 case ISD::INTRINSIC_WO_CHAIN: {
14641 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14642 // TODO: Handle more intrinsics
14643 switch (IntrinsicID) {
14644 case Intrinsic::amdgcn_cvt_pkrtz:
14645 case Intrinsic::amdgcn_cubeid:
14646 case Intrinsic::amdgcn_frexp_mant:
14647 case Intrinsic::amdgcn_fdot2:
14648 case Intrinsic::amdgcn_rcp:
14649 case Intrinsic::amdgcn_rsq:
14650 case Intrinsic::amdgcn_rsq_clamp:
14651 case Intrinsic::amdgcn_rcp_legacy:
14652 case Intrinsic::amdgcn_rsq_legacy:
14653 case Intrinsic::amdgcn_trig_preop:
14654 case Intrinsic::amdgcn_tanh:
14655 case Intrinsic::amdgcn_log:
14656 case Intrinsic::amdgcn_exp2:
14657 case Intrinsic::amdgcn_sqrt:
14658 return true;
14659 default:
14660 break;
14661 }
14662
14663 break;
14664 }
14665 default:
14666 break;
14667 }
14668
14669 // FIXME: denormalsEnabledForType is broken for dynamic
14670 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14671 DAG.isKnownNeverSNaN(Op);
14672}
14673
14674 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14675 unsigned MaxDepth) const {
14676 const MachineRegisterInfo &MRI = MF.getRegInfo();
14677 MachineInstr *MI = MRI.getVRegDef(Reg);
14678 unsigned Opcode = MI->getOpcode();
14679
14680 if (Opcode == AMDGPU::G_FCANONICALIZE)
14681 return true;
14682
14683 std::optional<FPValueAndVReg> FCR;
14684 // Constant splat (can be padded with undef) or scalar constant.
14685 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14686 if (FCR->Value.isSignaling())
14687 return false;
14688 if (!FCR->Value.isDenormal())
14689 return true;
14690
14691 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14692 return Mode == DenormalMode::getIEEE();
14693 }
14694
14695 if (MaxDepth == 0)
14696 return false;
14697
14698 switch (Opcode) {
14699 case AMDGPU::G_FADD:
14700 case AMDGPU::G_FSUB:
14701 case AMDGPU::G_FMUL:
14702 case AMDGPU::G_FCEIL:
14703 case AMDGPU::G_FFLOOR:
14704 case AMDGPU::G_FRINT:
14705 case AMDGPU::G_FNEARBYINT:
14706 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14707 case AMDGPU::G_INTRINSIC_TRUNC:
14708 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14709 case AMDGPU::G_FMA:
14710 case AMDGPU::G_FMAD:
14711 case AMDGPU::G_FSQRT:
14712 case AMDGPU::G_FDIV:
14713 case AMDGPU::G_FREM:
14714 case AMDGPU::G_FPOW:
14715 case AMDGPU::G_FPEXT:
14716 case AMDGPU::G_FLOG:
14717 case AMDGPU::G_FLOG2:
14718 case AMDGPU::G_FLOG10:
14719 case AMDGPU::G_FPTRUNC:
14720 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14721 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14722 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14723 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14724 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14725 return true;
14726 case AMDGPU::G_FNEG:
14727 case AMDGPU::G_FABS:
14728 case AMDGPU::G_FCOPYSIGN:
14729 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14730 case AMDGPU::G_FMINNUM:
14731 case AMDGPU::G_FMAXNUM:
14732 case AMDGPU::G_FMINNUM_IEEE:
14733 case AMDGPU::G_FMAXNUM_IEEE:
14734 case AMDGPU::G_FMINIMUM:
14735 case AMDGPU::G_FMAXIMUM:
14736 case AMDGPU::G_FMINIMUMNUM:
14737 case AMDGPU::G_FMAXIMUMNUM: {
14738 if (Subtarget->supportsMinMaxDenormModes() ||
14739 // FIXME: denormalsEnabledForType is broken for dynamic
14740 denormalsEnabledForType(MRI.getType(Reg), MF))
14741 return true;
14742
14743 [[fallthrough]];
14744 }
14745 case AMDGPU::G_BUILD_VECTOR:
14746 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14747 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14748 return false;
14749 return true;
14750 case AMDGPU::G_INTRINSIC:
14751 case AMDGPU::G_INTRINSIC_CONVERGENT:
14752 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14753 case Intrinsic::amdgcn_fmul_legacy:
14754 case Intrinsic::amdgcn_fmad_ftz:
14755 case Intrinsic::amdgcn_sqrt:
14756 case Intrinsic::amdgcn_fmed3:
14757 case Intrinsic::amdgcn_sin:
14758 case Intrinsic::amdgcn_cos:
14759 case Intrinsic::amdgcn_log:
14760 case Intrinsic::amdgcn_exp2:
14761 case Intrinsic::amdgcn_log_clamp:
14762 case Intrinsic::amdgcn_rcp:
14763 case Intrinsic::amdgcn_rcp_legacy:
14764 case Intrinsic::amdgcn_rsq:
14765 case Intrinsic::amdgcn_rsq_clamp:
14766 case Intrinsic::amdgcn_rsq_legacy:
14767 case Intrinsic::amdgcn_div_scale:
14768 case Intrinsic::amdgcn_div_fmas:
14769 case Intrinsic::amdgcn_div_fixup:
14770 case Intrinsic::amdgcn_fract:
14771 case Intrinsic::amdgcn_cvt_pkrtz:
14772 case Intrinsic::amdgcn_cubeid:
14773 case Intrinsic::amdgcn_cubema:
14774 case Intrinsic::amdgcn_cubesc:
14775 case Intrinsic::amdgcn_cubetc:
14776 case Intrinsic::amdgcn_frexp_mant:
14777 case Intrinsic::amdgcn_fdot2:
14778 case Intrinsic::amdgcn_trig_preop:
14779 case Intrinsic::amdgcn_tanh:
14780 return true;
14781 default:
14782 break;
14783 }
14784
14785 [[fallthrough]];
14786 default:
14787 return false;
14788 }
14789
14790 llvm_unreachable("invalid operation");
14791}
14792
14793// Constant fold canonicalize.
14794SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14795 const SDLoc &SL, EVT VT,
14796 const APFloat &C) const {
14797 // Flush denormals to 0 if not enabled.
14798 if (C.isDenormal()) {
14799 DenormalMode Mode =
14800 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14801 if (Mode == DenormalMode::getPreserveSign()) {
14802 return DAG.getConstantFP(
14803 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14804 }
14805
14806 if (Mode != DenormalMode::getIEEE())
14807 return SDValue();
14808 }
14809
14810 if (C.isNaN()) {
14811 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14812 if (C.isSignaling()) {
14813 // Quiet a signaling NaN.
14814 // FIXME: Is this supposed to preserve payload bits?
14815 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14816 }
14817
14818 // Make sure it is the canonical NaN bitpattern.
14819 //
14820 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14821 // immediate?
14822 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14823 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14824 }
14825
14826 // Already canonical.
14827 return DAG.getConstantFP(C, SL, VT);
14828}
14829
14830 static bool vectorEltWillFoldAway(SDValue Op) {
14831 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14832}
14833
14834SDValue
14835SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14836 DAGCombinerInfo &DCI) const {
14837 SelectionDAG &DAG = DCI.DAG;
14838 SDValue N0 = N->getOperand(0);
14839 EVT VT = N->getValueType(0);
14840
14841 // fcanonicalize undef -> qnan
14842 if (N0.isUndef()) {
14843 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14844 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14845 }
14846
14847 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14848 EVT VT = N->getValueType(0);
14849 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14850 }
14851
14852 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14853 // (fcanonicalize k)
14854 //
14855 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14856
14857 // TODO: This could be better with wider vectors that will be split to v2f16,
14858 // and to consider uses since there aren't that many packed operations.
14859 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14860 isTypeLegal(MVT::v2f16)) {
14861 SDLoc SL(N);
14862 SDValue NewElts[2];
14863 SDValue Lo = N0.getOperand(0);
14864 SDValue Hi = N0.getOperand(1);
14865 EVT EltVT = Lo.getValueType();
14866
14867 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
14868 for (unsigned I = 0; I != 2; ++I) {
14869 SDValue Op = N0.getOperand(I);
14870 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14871 NewElts[I] =
14872 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14873 } else if (Op.isUndef()) {
14874 // Handled below based on what the other operand is.
14875 NewElts[I] = Op;
14876 } else {
14877 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14878 }
14879 }
14880
14881 // If one half is undef, and one is constant, prefer a splat vector rather
14882 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14883 // cheaper to use and may be free with a packed operation.
14884 if (NewElts[0].isUndef()) {
14885 if (isa<ConstantFPSDNode>(NewElts[1]))
14886 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14887 ? NewElts[1]
14888 : DAG.getConstantFP(0.0f, SL, EltVT);
14889 }
14890
14891 if (NewElts[1].isUndef()) {
14892 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14893 ? NewElts[0]
14894 : DAG.getConstantFP(0.0f, SL, EltVT);
14895 }
14896
14897 return DAG.getBuildVector(VT, SL, NewElts);
14898 }
14899 }
14900
14901 return SDValue();
14902}
14903
14904static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14905 switch (Opc) {
14906 case ISD::FMAXNUM:
14907 case ISD::FMAXNUM_IEEE:
14908 case ISD::FMAXIMUMNUM:
14909 return AMDGPUISD::FMAX3;
14910 case ISD::FMAXIMUM:
14911 return AMDGPUISD::FMAXIMUM3;
14912 case ISD::SMAX:
14913 return AMDGPUISD::SMAX3;
14914 case ISD::UMAX:
14915 return AMDGPUISD::UMAX3;
14916 case ISD::FMINNUM:
14917 case ISD::FMINNUM_IEEE:
14918 case ISD::FMINIMUMNUM:
14919 return AMDGPUISD::FMIN3;
14920 case ISD::FMINIMUM:
14921 return AMDGPUISD::FMINIMUM3;
14922 case ISD::SMIN:
14923 return AMDGPUISD::SMIN3;
14924 case ISD::UMIN:
14925 return AMDGPUISD::UMIN3;
14926 default:
14927 llvm_unreachable("Not a min/max opcode");
14928 }
14929}
14930
14931SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14932 const SDLoc &SL, SDValue Src,
14933 SDValue MinVal,
14934 SDValue MaxVal,
14935 bool Signed) const {
14936
14937 // med3 comes from
14938 // min(max(x, K0), K1), K0 < K1
14939 // max(min(x, K0), K1), K1 < K0
14940 //
14941 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14942 // min/max op.
14943 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14944 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14945
14946 if (!MinK || !MaxK)
14947 return SDValue();
14948
14949 if (Signed) {
14950 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14951 return SDValue();
14952 } else {
14953 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14954 return SDValue();
14955 }
14956
14957 EVT VT = MinK->getValueType(0);
14958 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14959 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14960 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14961
14962 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14963 // not available, but this is unlikely to be profitable as constants
14964 // will often need to be materialized & extended, especially on
14965 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14966 return SDValue();
14967}
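// Illustrative example (editorial addition, not in the upstream source): for
// an i32 signed clamp such as smin(smax(x, -4), 10), the caller passes
// Src = x, MaxVal = -4 (the rhs of the smax) and MinVal = 10 (the rhs of the
// smin). Since -4 < 10, the code above emits smed3(x, -4, 10), whose median
// is exactly x clamped to [-4, 10].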
14968
14969 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14970 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14971 return C;
14972
14973 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14974 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14975 return C;
14976 }
14977
14978 return nullptr;
14979}
14980
14981SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14982 const SDLoc &SL, SDValue Op0,
14983 SDValue Op1) const {
14984 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14985 if (!K1)
14986 return SDValue();
14987
14988 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14989 if (!K0)
14990 return SDValue();
14991
14992 // Ordered >= (although NaN inputs should have folded away by now).
14993 if (K0->getValueAPF() > K1->getValueAPF())
14994 return SDValue();
14995
14996 // med3 with a nan input acts like
14997 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14998 //
14999 // So with a signaling nan input, the result depends on whether the IEEE
15000 // mode bit is enabled or not.
15001 // ieee=1
15002 // s0 snan: yields s2
15003 // s1 snan: yields s2
15004 // s2 snan: qnan
15005
15006 // s0 qnan: min(s1, s2)
15007 // s1 qnan: min(s0, s2)
15008 // s2 qnan: min(s0, s1)
15009
15010 // ieee=0
15011 // s0 snan: min(s1, s2)
15012 // s1 snan: min(s0, s2)
15013 // s2 snan: qnan
15014
15015 // s0 qnan: min(s1, s2)
15016 // s1 qnan: min(s0, s2)
15017 // s2 qnan: min(s0, s1)
15018 const MachineFunction &MF = DAG.getMachineFunction();
15019 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15020
15021 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
15022 // regardless of whether the input is a signaling nan if op0 is fmaximum or
15023 // fmaximumnum. We can only form it from fmaxnum_ieee if IEEE=1.
15024 EVT VT = Op0.getValueType();
15025 if (Info->getMode().DX10Clamp) {
15026 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15027 // hardware fmed3 behavior converting to a min.
15028 // FIXME: Should this be allowing -0.0?
15029 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15030 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15031 }
15032
15033 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15034 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15035 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15036 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15037 // then give the other result, which is different from med3 with a NaN
15038 // input.
15039 SDValue Var = Op0.getOperand(0);
15040 if (!DAG.isKnownNeverSNaN(Var))
15041 return SDValue();
15042
15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15044
15045 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15046 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15047 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15048 SDValue(K0, 0), SDValue(K1, 0));
15049 }
15050 }
15051
15052 return SDValue();
15053}
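// Illustrative examples (editorial addition, not in the upstream source):
// with dx10_clamp enabled, fminnum(fmaxnum(x, 0.0), 1.0) becomes the CLAMP
// node above, while fminnum(fmaxnum(x, 2.0), 4.0) on f32, with x known not to
// be a signaling nan, becomes fmed3(x, 2.0, 4.0); both constants are inline
// immediates, so the multiple-use restriction does not apply.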
15054
15055/// \return true if the subtarget supports minimum3 and maximum3 with the given
15056/// base min/max opcode \p Opc for type \p VT.
15057static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15058 EVT VT) {
15059 switch (Opc) {
15060 case ISD::FMINNUM:
15061 case ISD::FMAXNUM:
15062 case ISD::FMINNUM_IEEE:
15063 case ISD::FMAXNUM_IEEE:
15064 case ISD::FMINIMUMNUM:
15065 case ISD::FMAXIMUMNUM:
15066 case AMDGPUISD::FMIN_LEGACY:
15067 case AMDGPUISD::FMAX_LEGACY:
15068 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15069 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15070 case ISD::FMINIMUM:
15071 case ISD::FMAXIMUM:
15072 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15073 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15074 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15075 case ISD::SMAX:
15076 case ISD::SMIN:
15077 case ISD::UMAX:
15078 case ISD::UMIN:
15079 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15080 default:
15081 return false;
15082 }
15083
15084 llvm_unreachable("not a min/max opcode");
15085}
15086
15087SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15088 DAGCombinerInfo &DCI) const {
15089 SelectionDAG &DAG = DCI.DAG;
15090
15091 EVT VT = N->getValueType(0);
15092 unsigned Opc = N->getOpcode();
15093 SDValue Op0 = N->getOperand(0);
15094 SDValue Op1 = N->getOperand(1);
15095
15096 // Only do this if the inner op has one use, since otherwise this just
15097 // increases register pressure for no benefit.
15098
15099 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15100 // max(max(a, b), c) -> max3(a, b, c)
15101 // min(min(a, b), c) -> min3(a, b, c)
15102 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15103 SDLoc DL(N);
15104 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15105 Op0.getOperand(0), Op0.getOperand(1), Op1);
15106 }
15107
15108 // Try commuted.
15109 // max(a, max(b, c)) -> max3(a, b, c)
15110 // min(a, min(b, c)) -> min3(a, b, c)
15111 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15112 SDLoc DL(N);
15113 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15114 Op0, Op1.getOperand(0), Op1.getOperand(1));
15115 }
15116 }
15117
15118 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15119 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15120 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15121 if (SDValue Med3 = performIntMed3ImmCombine(
15122 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15123 return Med3;
15124 }
15125 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15126 if (SDValue Med3 = performIntMed3ImmCombine(
15127 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15128 return Med3;
15129 }
15130
15131 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15132 if (SDValue Med3 = performIntMed3ImmCombine(
15133 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15134 return Med3;
15135 }
15136 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15137 if (SDValue Med3 = performIntMed3ImmCombine(
15138 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15139 return Med3;
15140 }
15141
15142 // if !is_snan(x):
15143 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15144 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15145 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15146 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15147 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15148 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15149 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15150 (Opc == AMDGPUISD::FMIN_LEGACY &&
15151 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15152 (VT == MVT::f32 || VT == MVT::f64 ||
15153 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15154 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15155 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15156 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15157 Op0.hasOneUse()) {
15158 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15159 return Res;
15160 }
15161
15162 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15163 // for some types, but at a higher cost since it's implemented with a 3
15164 // operand form.
15165 const SDNodeFlags Flags = N->getFlags();
15166 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15167 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15168 unsigned NewOpc =
15169 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15170 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15171 }
15172
15173 return SDValue();
15174}
15175
15176 static bool isClampZeroToOne(SDValue A, SDValue B) {
15177 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15178 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15179 // FIXME: Should this be allowing -0.0?
15180 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15181 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15182 }
15183 }
15184
15185 return false;
15186}
15187
15188// FIXME: Should only worry about snans for version with chain.
15189SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15190 DAGCombinerInfo &DCI) const {
15191 EVT VT = N->getValueType(0);
15192 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15193 // NaNs. With a NaN input, the order of the operands may change the result.
15194
15195 SelectionDAG &DAG = DCI.DAG;
15196 SDLoc SL(N);
15197
15198 SDValue Src0 = N->getOperand(0);
15199 SDValue Src1 = N->getOperand(1);
15200 SDValue Src2 = N->getOperand(2);
15201
15202 if (isClampZeroToOne(Src0, Src1)) {
15203 // const_a, const_b, x -> clamp is safe in all cases including signaling
15204 // nans.
15205 // FIXME: Should this be allowing -0.0?
15206 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15207 }
15208
15209 const MachineFunction &MF = DAG.getMachineFunction();
15210 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15211
15212 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15213 // handling no dx10-clamp?
15214 if (Info->getMode().DX10Clamp) {
15215 // If NaN is clamped to 0, we are free to reorder the inputs.
15216
15217 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15218 std::swap(Src0, Src1);
15219
15220 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15221 std::swap(Src1, Src2);
15222
15223 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15224 std::swap(Src0, Src1);
15225
15226 if (isClampZeroToOne(Src1, Src2))
15227 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15228 }
15229
15230 return SDValue();
15231}
15232
15233SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15234 DAGCombinerInfo &DCI) const {
15235 SDValue Src0 = N->getOperand(0);
15236 SDValue Src1 = N->getOperand(1);
15237 if (Src0.isUndef() && Src1.isUndef())
15238 return DCI.DAG.getUNDEF(N->getValueType(0));
15239 return SDValue();
15240}
15241
15242// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15243// expanded into a set of cmp/select instructions.
15244 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15245 unsigned NumElem,
15246 bool IsDivergentIdx,
15247 const GCNSubtarget *Subtarget) {
15248 if (UseDivergentRegisterIndexing)
15249 return false;
15250
15251 unsigned VecSize = EltSize * NumElem;
15252
15253 // Sub-dword vectors that are 2 dwords or smaller have a better implementation.
15254 if (VecSize <= 64 && EltSize < 32)
15255 return false;
15256
15257 // Always expand the remaining sub-dword cases, otherwise they will be
15258 // lowered via memory.
15259 if (EltSize < 32)
15260 return true;
15261
15262 // Always do this if var-idx is divergent, otherwise it will become a loop.
15263 if (IsDivergentIdx)
15264 return true;
15265
15266 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15267 unsigned NumInsts = NumElem /* Number of compares */ +
15268 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15269
15270 // On some architectures (GFX9) movrel is not available and it's better
15271 // to expand.
15272 if (Subtarget->useVGPRIndexMode())
15273 return NumInsts <= 16;
15274
15275 // If movrel is available, use it instead of expanding for vector of 8
15276 // elements.
15277 if (Subtarget->hasMovrel())
15278 return NumInsts <= 15;
15279
15280 return true;
15281}
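// Worked example of the cost model above (editorial addition, not in the
// upstream source): a v4i32 access needs 4 compares plus 4 v_cndmask_b32,
// i.e. NumInsts = 8, so it is expanded on either path; a v8i32 access needs
// 8 + 8 = 16 instructions, which still fits the VGPR-index-mode budget of 16
// but exceeds the movrel budget of 15, so movrel is preferred there.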
15282
15283 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15284 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15285 if (isa<ConstantSDNode>(Idx))
15286 return false;
15287
15288 SDValue Vec = N->getOperand(0);
15289 EVT VecVT = Vec.getValueType();
15290 EVT EltVT = VecVT.getVectorElementType();
15291 unsigned EltSize = EltVT.getSizeInBits();
15292 unsigned NumElem = VecVT.getVectorNumElements();
15293
15294 return SITargetLowering::shouldExpandVectorDynExt(
15295 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15296}
15297
15298SDValue
15299SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15300 DAGCombinerInfo &DCI) const {
15301 SDValue Vec = N->getOperand(0);
15302 SelectionDAG &DAG = DCI.DAG;
15303
15304 EVT VecVT = Vec.getValueType();
15305 EVT VecEltVT = VecVT.getVectorElementType();
15306 EVT ResVT = N->getValueType(0);
15307
15308 unsigned VecSize = VecVT.getSizeInBits();
15309 unsigned VecEltSize = VecEltVT.getSizeInBits();
15310
15311 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15312 allUsesHaveSourceMods(N)) {
15313 SDLoc SL(N);
15314 SDValue Idx = N->getOperand(1);
15315 SDValue Elt =
15316 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15317 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15318 }
15319
15320 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15321 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15322 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15323 // depending on the shift operand. See e.g. performSraCombine().
15324 // This combine ensures that the optimisation is compatible with v2i32
15325 // legalised AND.
15326 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15327 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15328
15329 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15330 if (!C || C->getZExtValue() != 0x1f)
15331 return SDValue();
15332
15333 SDLoc SL(N);
15334 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15335 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15336 Vec->getOperand(0), N->getOperand(1));
15337 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15338 DAG.ReplaceAllUsesWith(N, A.getNode());
15339 }
15340
15341 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15342 // =>
15343 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15344 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15345 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15346 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15347 SDLoc SL(N);
15348 SDValue Idx = N->getOperand(1);
15349 unsigned Opc = Vec.getOpcode();
15350
15351 switch (Opc) {
15352 default:
15353 break;
15354 // TODO: Support other binary operations.
15355 case ISD::FADD:
15356 case ISD::FSUB:
15357 case ISD::FMUL:
15358 case ISD::ADD:
15359 case ISD::UMIN:
15360 case ISD::UMAX:
15361 case ISD::SMIN:
15362 case ISD::SMAX:
15363 case ISD::FMAXNUM:
15364 case ISD::FMINNUM:
15365 case ISD::FMAXNUM_IEEE:
15366 case ISD::FMINNUM_IEEE:
15367 case ISD::FMAXIMUM:
15368 case ISD::FMINIMUM: {
15369 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15370 Vec.getOperand(0), Idx);
15371 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15372 Vec.getOperand(1), Idx);
15373
15374 DCI.AddToWorklist(Elt0.getNode());
15375 DCI.AddToWorklist(Elt1.getNode());
15376 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15377 }
15378 }
15379 }
15380
15381 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15382 if (shouldExpandVectorDynExt(N)) {
15383 SDLoc SL(N);
15384 SDValue Idx = N->getOperand(1);
15385 SDValue V;
15386 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15387 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15388 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15389 if (I == 0)
15390 V = Elt;
15391 else
15392 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15393 }
15394 return V;
15395 }
15396
15397 if (!DCI.isBeforeLegalize())
15398 return SDValue();
15399
15400 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15401 // elements. This exposes more load reduction opportunities by replacing
15402 // multiple small extract_vector_elements with a single 32-bit extract.
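// Worked example (editorial addition, not in the upstream source): for a
// loaded v8i8 vector, extracting element 5 gives BitIndex = 40, so EltIdx = 1
// and LeftoverBitIdx = 8; the vector is bitcast to v2i32, dword 1 is
// extracted, shifted right by 8, and the result is truncated back to i8.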
15403 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15404 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15405 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15406 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15407
15408 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15409 unsigned EltIdx = BitIndex / 32;
15410 unsigned LeftoverBitIdx = BitIndex % 32;
15411 SDLoc SL(N);
15412
15413 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15414 DCI.AddToWorklist(Cast.getNode());
15415
15416 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15417 DAG.getConstant(EltIdx, SL, MVT::i32));
15418 DCI.AddToWorklist(Elt.getNode());
15419 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15420 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15421 DCI.AddToWorklist(Srl.getNode());
15422
15423 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15424 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15425 DCI.AddToWorklist(Trunc.getNode());
15426
15427 if (VecEltVT == ResVT) {
15428 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15429 }
15430
15431 assert(ResVT.isScalarInteger());
15432 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15433 }
15434
15435 return SDValue();
15436}
15437
15438SDValue
15439SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15440 DAGCombinerInfo &DCI) const {
15441 SDValue Vec = N->getOperand(0);
15442 SDValue Idx = N->getOperand(2);
15443 EVT VecVT = Vec.getValueType();
15444 EVT EltVT = VecVT.getVectorElementType();
15445
15446 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15447 // => BUILD_VECTOR n x select (e, const-idx)
15448 if (!shouldExpandVectorDynExt(N))
15449 return SDValue();
15450
15451 SelectionDAG &DAG = DCI.DAG;
15452 SDLoc SL(N);
15453 SDValue Ins = N->getOperand(1);
15454 EVT IdxVT = Idx.getValueType();
15455
15456 SmallVector<SDValue, 16> Ops;
15457 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15458 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15459 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15460 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15461 Ops.push_back(V);
15462 }
15463
15464 return DAG.getBuildVector(VecVT, SL, Ops);
15465}
15466
15467/// Return the source of an fp_extend from f16 to f32, or a converted FP
15468/// constant.
15469 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15470 if (Src.getOpcode() == ISD::FP_EXTEND &&
15471 Src.getOperand(0).getValueType() == MVT::f16) {
15472 return Src.getOperand(0);
15473 }
15474
15475 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15476 APFloat Val = CFP->getValueAPF();
15477 bool LosesInfo = true;
15478 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15479 if (!LosesInfo)
15480 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15481 }
15482
15483 return SDValue();
15484}
15485
15486SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15487 DAGCombinerInfo &DCI) const {
15488 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15489 "combine only useful on gfx8");
15490
15491 SDValue TruncSrc = N->getOperand(0);
15492 EVT VT = N->getValueType(0);
15493 if (VT != MVT::f16)
15494 return SDValue();
15495
15496 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15497 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15498 return SDValue();
15499
15500 SelectionDAG &DAG = DCI.DAG;
15501 SDLoc SL(N);
15502
15503 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15504 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15505 // casting back.
15506
15507 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15508 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15509 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15510 if (!A)
15511 return SDValue();
15512
15513 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15514 if (!B)
15515 return SDValue();
15516
15517 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15518 if (!C)
15519 return SDValue();
15520
15521 // This changes signaling nan behavior. If an input is a signaling nan, it
15522 // would have been quieted by the fpext originally. We don't care because
15523 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15524 // we would be worse off than just doing the promotion.
15525 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15526 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15527 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15528 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15529}
15530
15531unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15532 const SDNode *N0,
15533 const SDNode *N1) const {
15534 EVT VT = N0->getValueType(0);
15535
15536 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15537 // support denormals ever.
15538 if (((VT == MVT::f32 &&
15539 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15540 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15541 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15542 isOperationLegal(ISD::FMAD, VT))
15543 return ISD::FMAD;
15544
15545 const TargetOptions &Options = DAG.getTarget().Options;
15546 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15547 (N0->getFlags().hasAllowContract() &&
15548 N1->getFlags().hasAllowContract())) &&
15549 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15550 return ISD::FMA;
15551 }
15552
15553 return 0;
15554}
15555
15556// For a reassociatable opcode perform:
15557// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15558SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15559 SelectionDAG &DAG) const {
15560 EVT VT = N->getValueType(0);
15561 if (VT != MVT::i32 && VT != MVT::i64)
15562 return SDValue();
15563
15564 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15565 return SDValue();
15566
15567 unsigned Opc = N->getOpcode();
15568 SDValue Op0 = N->getOperand(0);
15569 SDValue Op1 = N->getOperand(1);
15570
15571 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15572 return SDValue();
15573
15574 if (Op0->isDivergent())
15575 std::swap(Op0, Op1);
15576
15577 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15578 return SDValue();
15579
15580 SDValue Op2 = Op1.getOperand(1);
15581 Op1 = Op1.getOperand(0);
15582 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15583 return SDValue();
15584
15585 if (Op1->isDivergent())
15586 std::swap(Op1, Op2);
15587
15588 SDLoc SL(N);
15589 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15590 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15591}
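// Illustrative example (editorial addition, not in the upstream source): with
// x and z uniform and y divergent, add x, (add y, z) is rewritten as
// add (add x, z), y, so the uniform part x + z can stay on the SALU and only
// the final add involves the divergent value.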
15592
15593static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15594 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15595 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15596 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15597 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15598 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15599}
15600
15601// Fold
15602// y = lshr i64 x, 32
15603// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15604// with Const.hi == -1
15605// To
15606 // res = mad_u64_u32 y.lo, Const.lo, x.lo
15607 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15608 SDValue MulLHS, SDValue MulRHS,
15609 SDValue AddRHS) {
15610 if (MulRHS.getOpcode() == ISD::SRL)
15611 std::swap(MulLHS, MulRHS);
15612
15613 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15614 return SDValue();
15615
15616 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15617 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15618 MulLHS.getOperand(0) != AddRHS)
15619 return SDValue();
15620
15621 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
15622 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15623 return SDValue();
15624
15625 SDValue ConstMul =
15626 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15627 return getMad64_32(DAG, SL, MVT::i64,
15628 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15629 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15630}
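// Why the fold above holds (editorial sketch, not in the upstream source):
// let y = x >> 32, so x = (y << 32) + x.lo. With Const.hi == -1,
// y * Const = y * Const.lo + (y << 32) * Const.hi = y * Const.lo - (y << 32)
// modulo 2^64, and adding x cancels the (y << 32) terms, leaving
// y * Const.lo + x.lo, i.e. mad_u64_u32(y.lo, Const.lo, zext(x.lo)).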
15631
15632// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15633// multiplies, if any.
15634//
15635// Full 64-bit multiplies that feed into an addition are lowered here instead
15636// of using the generic expansion. The generic expansion ends up with
15637// a tree of ADD nodes that prevents us from using the "add" part of the
15638// MAD instruction. The expansion produced here results in a chain of ADDs
15639// instead of a tree.
15640SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15641 DAGCombinerInfo &DCI) const {
15642 assert(N->isAnyAdd());
15643
15644 SelectionDAG &DAG = DCI.DAG;
15645 EVT VT = N->getValueType(0);
15646 SDLoc SL(N);
15647 SDValue LHS = N->getOperand(0);
15648 SDValue RHS = N->getOperand(1);
15649
15650 if (VT.isVector())
15651 return SDValue();
15652
15653 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15654 // result in scalar registers for uniform values.
15655 if (!N->isDivergent() && Subtarget->hasSMulHi())
15656 return SDValue();
15657
15658 unsigned NumBits = VT.getScalarSizeInBits();
15659 if (NumBits <= 32 || NumBits > 64)
15660 return SDValue();
15661
15662 if (LHS.getOpcode() != ISD::MUL) {
15663 assert(RHS.getOpcode() == ISD::MUL);
15664 std::swap(LHS, RHS);
15665 }
15666
15667 // Avoid the fold if it would unduly increase the number of multiplies due to
15668 // multiple uses, except on hardware with full-rate multiply-add (which is
15669 // part of full-rate 64-bit ops).
15670 if (!Subtarget->hasFullRate64Ops()) {
15671 unsigned NumUsers = 0;
15672 for (SDNode *User : LHS->users()) {
15673 // There is a use that does not feed into addition, so the multiply can't
15674 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15675 if (!User->isAnyAdd())
15676 return SDValue();
15677
15678 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15679 // MUL + 3xADD + 3xADDC over 3xMAD.
15680 ++NumUsers;
15681 if (NumUsers >= 3)
15682 return SDValue();
15683 }
15684 }
15685
15686 SDValue MulLHS = LHS.getOperand(0);
15687 SDValue MulRHS = LHS.getOperand(1);
15688 SDValue AddRHS = RHS;
15689
15690 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15691 return FoldedMAD;
15692
15693 // Always check whether operands are small unsigned values, since that
15694 // knowledge is useful in more cases. Check for small signed values only if
15695 // doing so can unlock a shorter code sequence.
15696 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15697 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15698
15699 bool MulSignedLo = false;
15700 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15701 MulSignedLo =
15702 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15703 }
15704
15705 // The operands and final result all have the same number of bits. If
15706 // operands need to be extended, they can be extended with garbage. The
15707 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15708 // truncated away in the end.
15709 if (VT != MVT::i64) {
15710 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15711 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15712 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15713 }
15714
15715 // The basic code generated is conceptually straightforward. Pseudo code:
15716 //
15717 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15718 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15719 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15720 //
15721 // The second and third lines are optional, depending on whether the factors
15722 // are {sign,zero}-extended or not.
15723 //
15724 // The actual DAG is noisier than the pseudo code, but only due to
15725 // instructions that disassemble values into low and high parts, and
15726 // assemble the final result.
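// Editorial note (not in the upstream source): the pseudo code follows from
//   lhs * rhs + accum
//     = mad_64_32(lhs.lo, rhs.lo, accum)
//       + ((lhs.hi * rhs.lo + lhs.lo * rhs.hi) << 32)   (mod 2^64)
// since the lhs.hi * rhs.hi term is shifted out of the 64-bit result. A cross
// product is skipped when the corresponding high half is known to be zero, or
// both are skipped when the signed mad already covers the sign extension.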
15727 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15728
15729 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15730 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15731 SDValue Accum =
15732 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15733
15734 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15735 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15736
15737 if (!MulLHSUnsigned32) {
15738 auto MulLHSHi =
15739 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15740 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15741 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15742 }
15743
15744 if (!MulRHSUnsigned32) {
15745 auto MulRHSHi =
15746 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15747 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15748 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15749 }
15750
15751 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15752 Accum = DAG.getBitcast(MVT::i64, Accum);
15753 }
15754
15755 if (VT != MVT::i64)
15756 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15757 return Accum;
15758}
15759
15760SDValue
15761SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15762 DAGCombinerInfo &DCI) const {
15763 SDValue RHS = N->getOperand(1);
15764 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15765 if (!CRHS)
15766 return SDValue();
15767
15768 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15769 // common.
15770 uint64_t Val = CRHS->getZExtValue();
15771 if (countr_zero(Val) >= 32) {
15772 SelectionDAG &DAG = DCI.DAG;
15773 SDLoc SL(N);
15774 SDValue LHS = N->getOperand(0);
15775
15776 // Avoid carry machinery if we know the low half of the add does not
15777 // contribute to the final result.
15778 //
15779 // add i64:x, K if computeTrailingZeros(K) >= 32
15780 // => build_pair (add x.hi, K.hi), x.lo
15781
15782 // Breaking the 64-bit add here with this strange constant is unlikely
15783 // to interfere with addressing mode patterns.
15784
15785 SDValue Hi = getHiHalf64(LHS, DAG);
15786 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15787 unsigned Opcode = N->getOpcode();
15788 if (Opcode == ISD::PTRADD)
15789 Opcode = ISD::ADD;
15790 SDValue AddHi =
15791 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15792
15793 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15794 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15795 }
15796
15797 return SDValue();
15798}
15799
15800 // Collect the ultimate src of each of the mul node's operands, and confirm
15801 // each operand contributes only 8 bits.
15802static std::optional<ByteProvider<SDValue>>
15803handleMulOperand(const SDValue &MulOperand) {
15804 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15805 if (!Byte0 || Byte0->isConstantZero()) {
15806 return std::nullopt;
15807 }
15808 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15809 if (Byte1 && !Byte1->isConstantZero()) {
15810 return std::nullopt;
15811 }
15812 return Byte0;
15813}
15814
15815static unsigned addPermMasks(unsigned First, unsigned Second) {
15816 unsigned FirstCs = First & 0x0c0c0c0c;
15817 unsigned SecondCs = Second & 0x0c0c0c0c;
15818 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15819 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15820
15821 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15822 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15823 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15824 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15825
15826 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15827}
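// Worked example (editorial addition, not in the upstream source):
// addPermMasks(0x0c0c0c01, 0x060c0c0c) yields 0x060c0c01: the 0x01 byte
// select from the first mask and the 0x06 select from the second are kept,
// and byte positions where both masks hold the 0x0c "pick zero" selector stay
// 0x0c. A minimal compile-time check of that example (hypothetical, mirroring
// the expression above):
static_assert(((0x0c0c0c01u & ~0x0c0c0c0cu) | (0x060c0c0cu & ~0x0c0c0c0cu) |
               ((0x0c0c0c01u & 0x0c0c0c0cu) & (0x060c0c0cu & 0x0c0c0c0cu))) ==
                  0x060c0c01u,
              "addPermMasks worked example");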
15828
15829struct DotSrc {
15830 SDValue SrcOp;
15831 int64_t PermMask;
15832 int64_t DWordOffset;
15833};
15834
15835 static void placeSources(ByteProvider<SDValue> &Src0,
15836 ByteProvider<SDValue> &Src1,
15837 SmallVectorImpl<DotSrc> &Src0s,
15838 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15839
15840 assert(Src0.Src.has_value() && Src1.Src.has_value());
15841 // Src0s and Src1s are empty, just place arbitrarily.
15842 if (Step == 0) {
15843 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15844 Src0.SrcOffset / 4});
15845 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15846 Src1.SrcOffset / 4});
15847 return;
15848 }
15849
15850 for (int BPI = 0; BPI < 2; BPI++) {
15851 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15852 if (BPI == 1) {
15853 BPP = {Src1, Src0};
15854 }
15855 unsigned ZeroMask = 0x0c0c0c0c;
15856 unsigned FMask = 0xFF << (8 * (3 - Step));
15857
15858 unsigned FirstMask =
15859 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15860 unsigned SecondMask =
15861 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15862 // Attempt to find the Src vector which contains our SDValue; if found, add
15863 // our perm mask to the existing one. If we are unable to find a match for
15864 // the first SDValue, attempt to find a match for the second.
15865 int FirstGroup = -1;
15866 for (int I = 0; I < 2; I++) {
15867 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15868 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15869 return IterElt.SrcOp == *BPP.first.Src &&
15870 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15871 };
15872
15873 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15874 if (Match != Srcs.end()) {
15875 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15876 FirstGroup = I;
15877 break;
15878 }
15879 }
15880 if (FirstGroup != -1) {
15881 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15882 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15883 return IterElt.SrcOp == *BPP.second.Src &&
15884 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15885 };
15886 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15887 if (Match != Srcs.end()) {
15888 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15889 } else
15890 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15891 return;
15892 }
15893 }
15894
15895 // If we have made it here, then we could not find a match in Src0s or Src1s
15896 // for either Src0 or Src1, so just place them arbitrarily.
15897
15898 unsigned ZeroMask = 0x0c0c0c0c;
15899 unsigned FMask = 0xFF << (8 * (3 - Step));
15900
15901 Src0s.push_back(
15902 {*Src0.Src,
15903 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15904 Src0.SrcOffset / 4});
15905 Src1s.push_back(
15906 {*Src1.Src,
15907 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15908 Src1.SrcOffset / 4});
15909}
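// Editorial note (not in the upstream source): at step S the byte select
// (SrcOffset % 4) is written into mask byte (3 - S) and every other mask byte
// keeps the 0x0c zero selector. For example, Step = 1 with SrcOffset = 6
// produces the mask 0x0c020c0c and records DWordOffset = 1, i.e. byte 2 of
// dword 1 of that source operand.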
15910
15911 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15912 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15913 bool IsAny) {
15914
15915 // If we just have one source, just permute it accordingly.
15916 if (Srcs.size() == 1) {
15917 auto *Elt = Srcs.begin();
15918 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15919
15920 // v_perm will produce the original value
15921 if (Elt->PermMask == 0x3020100)
15922 return EltOp;
15923
15924 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15925 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15926 }
15927
15928 auto *FirstElt = Srcs.begin();
15929 auto *SecondElt = std::next(FirstElt);
15930
15931 SmallVector<SDValue, 2> Perms;
15932
15933 // If we have multiple sources in the chain, combine them via perms (using
15934 // calculated perm mask) and Ors.
15935 while (true) {
15936 auto FirstMask = FirstElt->PermMask;
15937 auto SecondMask = SecondElt->PermMask;
15938
15939 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15940 unsigned FirstPlusFour = FirstMask | 0x04040404;
15941 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15942 // original 0x0C.
15943 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15944
15945 auto PermMask = addPermMasks(FirstMask, SecondMask);
15946 auto FirstVal =
15947 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15948 auto SecondVal =
15949 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15950
15951 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15952 SecondVal,
15953 DAG.getConstant(PermMask, SL, MVT::i32)));
15954
15955 FirstElt = std::next(SecondElt);
15956 if (FirstElt == Srcs.end())
15957 break;
15958
15959 SecondElt = std::next(FirstElt);
15960 // If we only have a FirstElt, then just combine that into the cumulative
15961 // source node.
15962 if (SecondElt == Srcs.end()) {
15963 auto EltOp =
15964 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15965
15966 Perms.push_back(
15967 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15968 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15969 break;
15970 }
15971 }
15972
15973 assert(Perms.size() == 1 || Perms.size() == 2);
15974 return Perms.size() == 2
15975 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15976 : Perms[0];
15977}
15978
15979static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15980 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15981 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15982 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15983 EntryMask += ZeroMask;
15984 }
15985}
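// Worked example (editorial addition, not in the upstream source): with
// ChainLength == 2, a mask built for steps 0 and 1 such as 0x01000c0c is
// shifted right by 16 to 0x00000100 and padded with 0x0c0c0000, giving
// 0x0c0c0100, so the two live byte selects move to the low half and the
// unused upper byte lanes select zero. A minimal compile-time check of that
// arithmetic (hypothetical):
static_assert(((0x01000c0cu >> 16) + 0x0c0c0000u) == 0x0c0c0100u,
              "fixMasks worked example");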
15986
15987static bool isMul(const SDValue Op) {
15988 auto Opcode = Op.getOpcode();
15989
15990 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15991 Opcode == AMDGPUISD::MUL_I24);
15992}
15993
15994static std::optional<bool>
15995 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15996 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15997 const SDValue &S1Op, const SelectionDAG &DAG) {
15998 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15999 // of the dot4 is irrelevant.
16000 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16001 return false;
16002
16003 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16004 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16005 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16006 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16007 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16008 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16009
16010 assert(!(S0IsUnsigned && S0IsSigned));
16011 assert(!(S1IsUnsigned && S1IsSigned));
16012
16013 // There are 9 possible permutations of
16014 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16015
16016 // In two permutations, the sign bits are known to be the same for both Ops,
16017 // so simply return Signed / Unsigned corresponding to the MSB
16018
16019 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16020 return S0IsSigned;
16021
16022 // In another two permutations, the sign bits are known to be opposite. In
16023 // this case return std::nullopt to indicate a bad match.
16024
16025 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16026 return std::nullopt;
16027
16028 // In the remaining five permutations, we don't know the value of the sign
16029 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16030 // the upper bits must be extension bits. Thus, the only ways for the sign
16031 // bit to be unknown are if it was sign extended from an unknown value, or if
16032 // it was any extended. In either case, it is correct to use the signed
16033 // version of the dot4 signedness semantics.
16034
16035 // In two such permutations, we know the sign bit is set for
16036 // one op, and the other is unknown. It is okay to use the signed version of
16037 // dot4.
16038 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16039 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16040 return true;
16041
16042 // In one such permutation, we don't know either of the sign bits. It is okay
16043 // to use the signed version of dot4.
16044 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16045 return true;
16046
16047 // In two such permutations, we know the sign bit is unset for
16048 // one op, and the other is unknown. Return std::nullopt to indicate a
16049 // bad match.
16050 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16051 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16052 return std::nullopt;
16053
16054 llvm_unreachable("Fully covered condition");
16055}
16056
16057SDValue SITargetLowering::performAddCombine(SDNode *N,
16058 DAGCombinerInfo &DCI) const {
16059 SelectionDAG &DAG = DCI.DAG;
16060 EVT VT = N->getValueType(0);
16061 SDLoc SL(N);
16062 SDValue LHS = N->getOperand(0);
16063 SDValue RHS = N->getOperand(1);
16064
16065 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16066 if (Subtarget->hasMad64_32()) {
16067 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16068 return Folded;
16069 }
16070 }
16071
16072 if (SDValue V = reassociateScalarOps(N, DAG)) {
16073 return V;
16074 }
16075
16076 if (VT == MVT::i64) {
16077 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16078 return Folded;
16079 }
16080
16081 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16082 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16083 SDValue TempNode(N, 0);
16084 std::optional<bool> IsSigned;
16085 SmallVector<DotSrc, 4> Src0s;
16086 SmallVector<DotSrc, 4> Src1s;
16087 SmallVector<SDValue, 4> Src2s;
16088
16089 // Match the v_dot4 tree, while collecting src nodes.
16090 int ChainLength = 0;
16091 for (int I = 0; I < 4; I++) {
16092 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16093 if (MulIdx == -1)
16094 break;
16095 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16096 if (!Src0)
16097 break;
16098 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16099 if (!Src1)
16100 break;
16101
16102 auto IterIsSigned = checkDot4MulSignedness(
16103 TempNode->getOperand(MulIdx), *Src0, *Src1,
16104 TempNode->getOperand(MulIdx)->getOperand(0),
16105 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16106 if (!IterIsSigned)
16107 break;
16108 if (!IsSigned)
16109 IsSigned = *IterIsSigned;
16110 if (*IterIsSigned != *IsSigned)
16111 break;
16112 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16113 auto AddIdx = 1 - MulIdx;
16114 // Allow the special case where add (add (mul24, 0), mul24) became
16115 // add (mul24, mul24).
16116 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16117 Src2s.push_back(TempNode->getOperand(AddIdx));
16118 auto Src0 =
16119 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16120 if (!Src0)
16121 break;
16122 auto Src1 =
16123 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16124 if (!Src1)
16125 break;
16126 auto IterIsSigned = checkDot4MulSignedness(
16127 TempNode->getOperand(AddIdx), *Src0, *Src1,
16128 TempNode->getOperand(AddIdx)->getOperand(0),
16129 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16130 if (!IterIsSigned)
16131 break;
16132 assert(IsSigned);
16133 if (*IterIsSigned != *IsSigned)
16134 break;
16135 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16136 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16137 ChainLength = I + 2;
16138 break;
16139 }
16140
16141 TempNode = TempNode->getOperand(AddIdx);
16142 Src2s.push_back(TempNode);
16143 ChainLength = I + 1;
16144 if (TempNode->getNumOperands() < 2)
16145 break;
16146 LHS = TempNode->getOperand(0);
16147 RHS = TempNode->getOperand(1);
16148 }
16149
16150 if (ChainLength < 2)
16151 return SDValue();
16152
16153 // Masks were constructed with the assumption that we would find a chain of
16154 // length 4. If not, then we need to zero out the most significant bytes (via
16155 // a perm mask of 0x0c) so they do not affect the dot calculation.
16156 if (ChainLength < 4) {
16157 fixMasks(Src0s, ChainLength);
16158 fixMasks(Src1s, ChainLength);
16159 }
16160
16161 SDValue Src0, Src1;
16162
16163 // If we are just using a single source for both, and have permuted the
16164 // bytes consistently, we can just use the sources without permuting
16165 // (commutation).
16166 bool UseOriginalSrc = false;
16167 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16168 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16169 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16170 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16171 SmallVector<unsigned, 4> SrcBytes;
16172 auto Src0Mask = Src0s.begin()->PermMask;
16173 SrcBytes.push_back(Src0Mask & 0xFF000000);
16174 bool UniqueEntries = true;
16175 for (auto I = 1; I < 4; I++) {
16176 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16177
16178 if (is_contained(SrcBytes, NextByte)) {
16179 UniqueEntries = false;
16180 break;
16181 }
16182 SrcBytes.push_back(NextByte);
16183 }
16184
16185 if (UniqueEntries) {
16186 UseOriginalSrc = true;
16187
16188 auto *FirstElt = Src0s.begin();
16189 auto FirstEltOp =
16190 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16191
16192 auto *SecondElt = Src1s.begin();
16193 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16194 SecondElt->DWordOffset);
16195
16196 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16197 MVT::getIntegerVT(32));
16198 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16199 MVT::getIntegerVT(32));
16200 }
16201 }
16202
16203 if (!UseOriginalSrc) {
16204 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16205 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16206 }
16207
16208 assert(IsSigned);
16209 SDValue Src2 =
16210 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16211
16212 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16213 : Intrinsic::amdgcn_udot4,
16214 SL, MVT::i64);
16215
16216 assert(!VT.isVector());
16217 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16218 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16219
16220 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16221 }
16222
16223 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16224 return SDValue();
16225
16226 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16227 // add x, sext (setcc) => usubo_carry x, 0, setcc
16228 unsigned Opc = LHS.getOpcode();
16229 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16230 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16231 std::swap(RHS, LHS);
16232
16233 Opc = RHS.getOpcode();
16234 switch (Opc) {
16235 default:
16236 break;
16237 case ISD::ZERO_EXTEND:
16238 case ISD::SIGN_EXTEND:
16239 case ISD::ANY_EXTEND: {
16240 auto Cond = RHS.getOperand(0);
16241 // If this won't be a real VOPC output, we would still need to insert an
16242 // extra instruction anyway.
16243 if (!isBoolSGPR(Cond))
16244 break;
16245 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16246 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16247 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16248 return DAG.getNode(Opc, SL, VTList, Args);
16249 }
16250 case ISD::UADDO_CARRY: {
16251 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16252 if (!isNullConstant(RHS.getOperand(1)))
16253 break;
16254 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16255 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16256 }
16257 }
16258 return SDValue();
16259}
16260
16261SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16262 DAGCombinerInfo &DCI) const {
16263 SelectionDAG &DAG = DCI.DAG;
16264 SDLoc DL(N);
16265 EVT VT = N->getValueType(0);
16266 SDValue N0 = N->getOperand(0);
16267 SDValue N1 = N->getOperand(1);
16268
16269 // The following folds transform PTRADDs into regular arithmetic in cases
16270 // where the PTRADD wouldn't be folded as an immediate offset into memory
16271 // instructions anyway. They are target-specific in that other targets might
16272 // prefer to not lose information about the pointer arithmetic.
16273
16274 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16275 // Adapted from DAGCombiner::visitADDLikeCommutative.
16276 SDValue V, K;
16277 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16278 SDNodeFlags ShlFlags = N1->getFlags();
16279 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16280 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16281 // preserved.
16282 SDNodeFlags NewShlFlags =
16283 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16284 ? SDNodeFlags::NoSignedWrap
16285 : SDNodeFlags();
16286 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16287 DCI.AddToWorklist(Inner.getNode());
16288 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16289 }
16290
16291 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16292 // performAddCombine.
16293 if (N1.getOpcode() == ISD::MUL) {
16294 if (Subtarget->hasMad64_32()) {
16295 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16296 return Folded;
16297 }
16298 }
16299
16300 // If the 32 low bits of the constant are all zero, there is nothing to fold
16301 // into an immediate offset, so it's better to eliminate the unnecessary
16302 // addition for the lower 32 bits than to preserve the PTRADD.
16303 // Analogous to a fold in performAddCombine.
16304 if (VT == MVT::i64) {
16305 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16306 return Folded;
16307 }
16308
16309 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16310 return SDValue();
16311
16312 SDValue X = N0;
16313 SDValue Y = N1.getOperand(0);
16314 SDValue Z = N1.getOperand(1);
16315 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16316 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16317
16318 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16319 Y->isDivergent() != Z->isDivergent()) {
16320 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16321 // y are uniform and z isn't.
16322 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16323 // z are uniform and y isn't.
16324 // The goal is to push uniform operands up in the computation, so that they
16325 // can be handled with scalar operations. We can't use reassociateScalarOps
16326 // for this since it requires two identical commutative operations to
16327 // reassociate.
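// Illustrative note (not in the upstream source): for example,
//   ptradd p, (add uniform_off, divergent_off)
// is rewritten to
//   ptradd (ptradd p, uniform_off), divergent_off
// so that the inner pointer add can be selected as a scalar (SALU) add.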
16328 if (Y->isDivergent())
16329 std::swap(Y, Z);
16330 // If both additions in the original were NUW, reassociation preserves that.
16331 SDNodeFlags ReassocFlags =
16332 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16333 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16334 DCI.AddToWorklist(UniformInner.getNode());
16335 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16336 }
16337
16338 return SDValue();
16339}
16340
16341SDValue SITargetLowering::performSubCombine(SDNode *N,
16342 DAGCombinerInfo &DCI) const {
16343 SelectionDAG &DAG = DCI.DAG;
16344 EVT VT = N->getValueType(0);
16345
16346 if (VT == MVT::i64) {
16347 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16348 return Folded;
16349 }
16350
16351 if (VT != MVT::i32)
16352 return SDValue();
16353
16354 SDLoc SL(N);
16355 SDValue LHS = N->getOperand(0);
16356 SDValue RHS = N->getOperand(1);
16357
16358 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16359 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16360 unsigned Opc = RHS.getOpcode();
16361 switch (Opc) {
16362 default:
16363 break;
16364 case ISD::ZERO_EXTEND:
16365 case ISD::SIGN_EXTEND:
16366 case ISD::ANY_EXTEND: {
16367 auto Cond = RHS.getOperand(0);
16368 // If this won't be a real VOPC output, we would still need to insert an
16369 // extra instruction anyway.
16370 if (!isBoolSGPR(Cond))
16371 break;
16372 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16373 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16374 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16375 return DAG.getNode(Opc, SL, VTList, Args);
16376 }
16377 }
16378
16379 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16380 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16381 if (!isNullConstant(LHS.getOperand(1)))
16382 return SDValue();
16383 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16384 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16385 }
16386 return SDValue();
16387}
16388
16389SDValue
16390SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16391 DAGCombinerInfo &DCI) const {
16392
16393 if (N->getValueType(0) != MVT::i32)
16394 return SDValue();
16395
16396 if (!isNullConstant(N->getOperand(1)))
16397 return SDValue();
16398
16399 SelectionDAG &DAG = DCI.DAG;
16400 SDValue LHS = N->getOperand(0);
16401
16402 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16403 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16404 unsigned LHSOpc = LHS.getOpcode();
16405 unsigned Opc = N->getOpcode();
16406 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16407 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16408 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16409 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16410 }
16411 return SDValue();
16412}
16413
16414SDValue SITargetLowering::performFAddCombine(SDNode *N,
16415 DAGCombinerInfo &DCI) const {
16416 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16417 return SDValue();
16418
16419 SelectionDAG &DAG = DCI.DAG;
16420 EVT VT = N->getValueType(0);
16421
16422 SDLoc SL(N);
16423 SDValue LHS = N->getOperand(0);
16424 SDValue RHS = N->getOperand(1);
16425
16426 // These should really be instruction patterns, but writing patterns with
16427 // source modifiers is a pain.
16428
16429 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16430 if (LHS.getOpcode() == ISD::FADD) {
16431 SDValue A = LHS.getOperand(0);
16432 if (A == LHS.getOperand(1)) {
16433 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16434 if (FusedOp != 0) {
16435 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16436 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16437 }
16438 }
16439 }
16440
16441 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16442 if (RHS.getOpcode() == ISD::FADD) {
16443 SDValue A = RHS.getOperand(0);
16444 if (A == RHS.getOperand(1)) {
16445 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16446 if (FusedOp != 0) {
16447 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16448 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16449 }
16450 }
16451 }
16452
16453 return SDValue();
16454}
16455
16456SDValue SITargetLowering::performFSubCombine(SDNode *N,
16457 DAGCombinerInfo &DCI) const {
16458 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16459 return SDValue();
16460
16461 SelectionDAG &DAG = DCI.DAG;
16462 SDLoc SL(N);
16463 EVT VT = N->getValueType(0);
16464 assert(!VT.isVector());
16465
16466 // Try to get the fneg to fold into the source modifier. This undoes generic
16467 // DAG combines and folds them into the mad.
16468 //
16469 // Only do this if we are not trying to support denormals. v_mad_f32 does
16470 // not support denormals ever.
16471 SDValue LHS = N->getOperand(0);
16472 SDValue RHS = N->getOperand(1);
16473 if (LHS.getOpcode() == ISD::FADD) {
16474 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16475 SDValue A = LHS.getOperand(0);
16476 if (A == LHS.getOperand(1)) {
16477 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16478 if (FusedOp != 0) {
16479 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16480 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16481
16482 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16483 }
16484 }
16485 }
16486
16487 if (RHS.getOpcode() == ISD::FADD) {
16488 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16489
16490 SDValue A = RHS.getOperand(0);
16491 if (A == RHS.getOperand(1)) {
16492 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16493 if (FusedOp != 0) {
16494 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16495 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16496 }
16497 }
16498 }
16499
16500 return SDValue();
16501}
16502
16503SDValue SITargetLowering::performFDivCombine(SDNode *N,
16504 DAGCombinerInfo &DCI) const {
16505 SelectionDAG &DAG = DCI.DAG;
16506 SDLoc SL(N);
16507 EVT VT = N->getValueType(0);
16508
16509 // fsqrt legality correlates to rsq availability.
16510 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16511 return SDValue();
16512
16513 SDValue LHS = N->getOperand(0);
16514 SDValue RHS = N->getOperand(1);
16515
16516 SDNodeFlags Flags = N->getFlags();
16517 SDNodeFlags RHSFlags = RHS->getFlags();
16518 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16519 !RHS->hasOneUse())
16520 return SDValue();
16521
16522 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16523 bool IsNegative = false;
16524 if (CLHS->isExactlyValue(1.0) ||
16525 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16526 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16527 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16528 if (RHS.getOpcode() == ISD::FSQRT) {
16529 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16530 SDValue Rsq =
16531 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16532 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16533 }
16534 }
16535 }
16536
16537 return SDValue();
16538}
16539
16540SDValue SITargetLowering::performFMulCombine(SDNode *N,
16541 DAGCombinerInfo &DCI) const {
16542 SelectionDAG &DAG = DCI.DAG;
16543 EVT VT = N->getValueType(0);
16544 EVT ScalarVT = VT.getScalarType();
16545 EVT IntVT = VT.changeElementType(MVT::i32);
16546
16547 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16548 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16549 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16550 return SDValue();
16551 }
16552
16553 SDValue LHS = N->getOperand(0);
16554 SDValue RHS = N->getOperand(1);
16555
16556 // It is cheaper to materialize i32 inline constants than to materialize
16557 // f16 or f64 (or even non-inline f32) values; this can be done via ldexp,
16558 // as shown below:
16559 //
16560 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16561 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16562 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
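// Illustrative note (not in the upstream source): for example, with f64
// operands,
//   fmul x, (select c, 8.0, 0.25)      ; 8.0 = 2^3, 0.25 = 2^-2
// becomes
//   fldexp x, (select i32 c, 3, -2)
// so only small i32 immediates need to be materialized.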
16563 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16564 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16565 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16566 if (!TrueNode)
16567 return SDValue();
16568 const ConstantFPSDNode *FalseNode =
16569 isConstOrConstSplatFP(RHS.getOperand(2));
16570 if (!FalseNode)
16571 return SDValue();
16572
16573 if (TrueNode->isNegative() != FalseNode->isNegative())
16574 return SDValue();
16575
16576 // For f32, only non-inline constants should be transformed.
16577 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16578 if (ScalarVT == MVT::f32 &&
16579 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16580 TII->isInlineConstant(FalseNode->getValueAPF()))
16581 return SDValue();
16582
16583 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16584 if (TrueNodeExpVal == INT_MIN)
16585 return SDValue();
16586 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16587 if (FalseNodeExpVal == INT_MIN)
16588 return SDValue();
16589
16590 SDLoc SL(N);
16591 SDValue SelectNode =
16592 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16593 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16594 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16595
16596 LHS = TrueNode->isNegative()
16597 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16598 : LHS;
16599
16600 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16601 }
16602
16603 return SDValue();
16604}
16605
16606SDValue SITargetLowering::performFMACombine(SDNode *N,
16607 DAGCombinerInfo &DCI) const {
16608 SelectionDAG &DAG = DCI.DAG;
16609 EVT VT = N->getValueType(0);
16610 SDLoc SL(N);
16611
16612 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16613 return SDValue();
16614
16615 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16616 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16617 SDValue Op1 = N->getOperand(0);
16618 SDValue Op2 = N->getOperand(1);
16619 SDValue FMA = N->getOperand(2);
16620
16621 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16622 Op2.getOpcode() != ISD::FP_EXTEND)
16623 return SDValue();
16624
16625 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16626 // regardless of the denorm mode setting. Therefore,
16627 // fp-contract is sufficient to allow generating fdot2.
16628 const TargetOptions &Options = DAG.getTarget().Options;
16629 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16630 (N->getFlags().hasAllowContract() &&
16631 FMA->getFlags().hasAllowContract())) {
16632 Op1 = Op1.getOperand(0);
16633 Op2 = Op2.getOperand(0);
16634 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16635 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16636 return SDValue();
16637
16638 SDValue Vec1 = Op1.getOperand(0);
16639 SDValue Idx1 = Op1.getOperand(1);
16640 SDValue Vec2 = Op2.getOperand(0);
16641
16642 SDValue FMAOp1 = FMA.getOperand(0);
16643 SDValue FMAOp2 = FMA.getOperand(1);
16644 SDValue FMAAcc = FMA.getOperand(2);
16645
16646 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16647 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16648 return SDValue();
16649
16650 FMAOp1 = FMAOp1.getOperand(0);
16651 FMAOp2 = FMAOp2.getOperand(0);
16652 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16653 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16654 return SDValue();
16655
16656 SDValue Vec3 = FMAOp1.getOperand(0);
16657 SDValue Vec4 = FMAOp2.getOperand(0);
16658 SDValue Idx2 = FMAOp1.getOperand(1);
16659
16660 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16661 // Idx1 and Idx2 cannot be the same.
16662 Idx1 == Idx2)
16663 return SDValue();
16664
16665 if (Vec1 == Vec2 || Vec3 == Vec4)
16666 return SDValue();
16667
16668 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16669 return SDValue();
16670
16671 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16672 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16673 DAG.getTargetConstant(0, SL, MVT::i1));
16674 }
16675 }
16676 return SDValue();
16677}
16678
16679SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16680 DAGCombinerInfo &DCI) const {
16681 SelectionDAG &DAG = DCI.DAG;
16682 SDLoc SL(N);
16683
16684 SDValue LHS = N->getOperand(0);
16685 SDValue RHS = N->getOperand(1);
16686 EVT VT = LHS.getValueType();
16687 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16688
16689 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16690 if (!CRHS) {
16691 CRHS = dyn_cast<ConstantSDNode>(LHS);
16692 if (CRHS) {
16693 std::swap(LHS, RHS);
16694 CC = getSetCCSwappedOperands(CC);
16695 }
16696 }
16697
16698 if (CRHS) {
16699 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16700 isBoolSGPR(LHS.getOperand(0))) {
16701 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16702 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16703 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16704 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16705 if ((CRHS->isAllOnes() &&
16706 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16707 (CRHS->isZero() &&
16708 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16709 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16710 DAG.getAllOnesConstant(SL, MVT::i1));
16711 if ((CRHS->isAllOnes() &&
16712 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16713 (CRHS->isZero() &&
16714 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16715 return LHS.getOperand(0);
16716 }
16717
16718 const APInt &CRHSVal = CRHS->getAPIntValue();
16719 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16720 LHS.getOpcode() == ISD::SELECT &&
16721 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16722 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16723 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16724 isBoolSGPR(LHS.getOperand(0))) {
16725 // Given CT != FT:
16726 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16727 // setcc (select cc, CT, CF), CF, ne => cc
16728 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16729 // setcc (select cc, CT, CF), CT, eq => cc
16730 const APInt &CT = LHS.getConstantOperandAPInt(1);
16731 const APInt &CF = LHS.getConstantOperandAPInt(2);
16732
16733 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16734 (CT == CRHSVal && CC == ISD::SETNE))
16735 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16736 DAG.getAllOnesConstant(SL, MVT::i1));
16737 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16738 (CT == CRHSVal && CC == ISD::SETEQ))
16739 return LHS.getOperand(0);
16740 }
16741 }
16742
16743 // Eliminate setcc by using the carry-out from an add/sub instruction.
16744
16745 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16746 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16747 // similarly for subtraction
16748
16749 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16750 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
16751
16752 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16753 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
16754 (CC == ISD::SETUGT &&
16755 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
16756 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16757 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16758 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16759
16760 SDValue Op0 = LHS.getOperand(0);
16761 SDValue Op1 = LHS.getOperand(1);
16762
16763 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16764 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16765
16766 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16767 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16768
16769 SDValue NodeLo =
16770 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16771 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16772
16773 SDValue CarryInHi = NodeLo.getValue(1);
16774 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16775 SL, DAG.getVTList(MVT::i32, MVT::i1),
16776 {Op0Hi, Op1Hi, CarryInHi});
16777
16778 SDValue ResultLo = NodeLo.getValue(0);
16779 SDValue ResultHi = NodeHi.getValue(0);
16780
16781 SDValue JoinedResult =
16782 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16783
16784 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16785 SDValue Overflow = NodeHi.getValue(1);
16786 DCI.CombineTo(LHS.getNode(), Result);
16787 return Overflow;
16788 }
16789
16790 if (VT != MVT::f32 && VT != MVT::f64 &&
16791 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16792 return SDValue();
16793
16794 // Match isinf/isfinite pattern
16795 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16796 // (fcmp one (fabs x), inf) -> (fp_class x,
16797 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16798 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16799 LHS.getOpcode() == ISD::FABS) {
16800 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16801 if (!CRHS)
16802 return SDValue();
16803
16804 const APFloat &APF = CRHS->getValueAPF();
16805 if (APF.isInfinity() && !APF.isNegative()) {
16806 const unsigned IsInfMask =
16807 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16808 const unsigned IsFiniteMask =
16809 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16810 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16811 SIInstrFlags::P_SUBNORMAL;
16812 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16813 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16814 DAG.getConstant(Mask, SL, MVT::i32));
16815 }
16816 }
16817
16818 return SDValue();
16819}
16820
16821SDValue
16822SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16823 DAGCombinerInfo &DCI) const {
16824 SelectionDAG &DAG = DCI.DAG;
16825 SDLoc SL(N);
16826 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16827
16828 SDValue Src = N->getOperand(0);
16829 SDValue Shift = N->getOperand(0);
16830
16831 // TODO: Extend type shouldn't matter (assuming legal types).
16832 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16833 Shift = Shift.getOperand(0);
16834
16835 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16836 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16837 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16838 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16839 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16840 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16841 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16842 SDValue Shifted = DAG.getZExtOrTrunc(
16843 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16844
16845 unsigned ShiftOffset = 8 * Offset;
16846 if (Shift.getOpcode() == ISD::SHL)
16847 ShiftOffset -= C->getZExtValue();
16848 else
16849 ShiftOffset += C->getZExtValue();
16850
16851 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16852 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16853 MVT::f32, Shifted);
16854 }
16855 }
16856 }
16857
16858 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16859 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16860 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16861 // We simplified Src. If this node is not dead, visit it again so it is
16862 // folded properly.
16863 if (N->getOpcode() != ISD::DELETED_NODE)
16864 DCI.AddToWorklist(N);
16865 return SDValue(N, 0);
16866 }
16867
16868 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16869 if (SDValue DemandedSrc =
16870 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16871 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16872
16873 return SDValue();
16874}
16875
16876SDValue SITargetLowering::performClampCombine(SDNode *N,
16877 DAGCombinerInfo &DCI) const {
16878 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16879 if (!CSrc)
16880 return SDValue();
16881
16882 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16883 const APFloat &F = CSrc->getValueAPF();
16884 APFloat Zero = APFloat::getZero(F.getSemantics());
16885 if (F < Zero ||
16886 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16887 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16888 }
16889
16890 APFloat One(F.getSemantics(), "1.0");
16891 if (F > One)
16892 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16893
16894 return SDValue(CSrc, 0);
16895}
16896
16897SDValue SITargetLowering::performSelectCombine(SDNode *N,
16898 DAGCombinerInfo &DCI) const {
16899
16900 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16901 // integer).
16902 // Detect when CMP and SELECT use the same constant and fold them to avoid
16903 // loading the constant twice. Specifically handles patterns like:
16904 // %cmp = icmp eq i32 %val, 4242
16905 // %sel = select i1 %cmp, i32 4242, i32 %other
16906 // It can be optimized to reuse %val instead of 4242 in select.
16907 SDValue Cond = N->getOperand(0);
16908 SDValue TrueVal = N->getOperand(1);
16909 SDValue FalseVal = N->getOperand(2);
16910
16911 // Check if condition is a comparison.
16912 if (Cond.getOpcode() != ISD::SETCC)
16913 return SDValue();
16914
16915 SDValue LHS = Cond.getOperand(0);
16916 SDValue RHS = Cond.getOperand(1);
16917 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16918
16919 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16920 bool isInteger = LHS.getValueType().isInteger();
16921
16922 // Handle simple floating-point and integer types only.
16923 if (!isFloatingPoint && !isInteger)
16924 return SDValue();
16925
16926 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16927 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16928 if (!isEquality && !isNonEquality)
16929 return SDValue();
16930
16931 SDValue ArgVal, ConstVal;
16932 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16933 (isInteger && isa<ConstantSDNode>(RHS))) {
16934 ConstVal = RHS;
16935 ArgVal = LHS;
16936 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16937 (isInteger && isa<ConstantSDNode>(LHS))) {
16938 ConstVal = LHS;
16939 ArgVal = RHS;
16940 } else {
16941 return SDValue();
16942 }
16943
16944 // Skip optimization for inlinable immediates.
16945 if (isFloatingPoint) {
16946 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16947 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16948 return SDValue();
16949 } else {
16950 if (AMDGPU::isInlinableIntLiteral(
16951 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16952 return SDValue();
16953 }
16954
16955 // For equality and non-equality comparisons, patterns:
16956 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16957 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16958 if (!(isEquality && TrueVal == ConstVal) &&
16959 !(isNonEquality && FalseVal == ConstVal))
16960 return SDValue();
16961
16962 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16963 SDValue SelectRHS =
16964 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16965 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16966 SelectLHS, SelectRHS);
16967}
16968
16969 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16970 DAGCombinerInfo &DCI) const {
16971 switch (N->getOpcode()) {
16972 case ISD::ADD:
16973 case ISD::SUB:
16974 case ISD::SHL:
16975 case ISD::SRL:
16976 case ISD::SRA:
16977 case ISD::AND:
16978 case ISD::OR:
16979 case ISD::XOR:
16980 case ISD::MUL:
16981 case ISD::SETCC:
16982 case ISD::SELECT:
16983 case ISD::SMIN:
16984 case ISD::SMAX:
16985 case ISD::UMIN:
16986 case ISD::UMAX:
16987 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16988 return Res;
16989 break;
16990 default:
16991 break;
16992 }
16993
16994 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16995 return SDValue();
16996
16997 switch (N->getOpcode()) {
16998 case ISD::ADD:
16999 return performAddCombine(N, DCI);
17000 case ISD::PTRADD:
17001 return performPtrAddCombine(N, DCI);
17002 case ISD::SUB:
17003 return performSubCombine(N, DCI);
17004 case ISD::UADDO_CARRY:
17005 case ISD::USUBO_CARRY:
17006 return performAddCarrySubCarryCombine(N, DCI);
17007 case ISD::FADD:
17008 return performFAddCombine(N, DCI);
17009 case ISD::FSUB:
17010 return performFSubCombine(N, DCI);
17011 case ISD::FDIV:
17012 return performFDivCombine(N, DCI);
17013 case ISD::FMUL:
17014 return performFMulCombine(N, DCI);
17015 case ISD::SETCC:
17016 return performSetCCCombine(N, DCI);
17017 case ISD::SELECT:
17018 if (auto Res = performSelectCombine(N, DCI))
17019 return Res;
17020 break;
17021 case ISD::FMAXNUM:
17022 case ISD::FMINNUM:
17023 case ISD::FMAXNUM_IEEE:
17024 case ISD::FMINNUM_IEEE:
17025 case ISD::FMAXIMUM:
17026 case ISD::FMINIMUM:
17027 case ISD::FMAXIMUMNUM:
17028 case ISD::FMINIMUMNUM:
17029 case ISD::SMAX:
17030 case ISD::SMIN:
17031 case ISD::UMAX:
17032 case ISD::UMIN:
17033 case AMDGPUISD::FMIN_LEGACY:
17034 case AMDGPUISD::FMAX_LEGACY:
17035 return performMinMaxCombine(N, DCI);
17036 case ISD::FMA:
17037 return performFMACombine(N, DCI);
17038 case ISD::AND:
17039 return performAndCombine(N, DCI);
17040 case ISD::OR:
17041 return performOrCombine(N, DCI);
17042 case ISD::FSHR: {
17043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17044 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17045 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17046 return matchPERM(N, DCI);
17047 }
17048 break;
17049 }
17050 case ISD::XOR:
17051 return performXorCombine(N, DCI);
17052 case ISD::ZERO_EXTEND:
17053 return performZeroExtendCombine(N, DCI);
17054 case ISD::SIGN_EXTEND_INREG:
17055 return performSignExtendInRegCombine(N, DCI);
17056 case AMDGPUISD::FP_CLASS:
17057 return performClassCombine(N, DCI);
17058 case ISD::FCANONICALIZE:
17059 return performFCanonicalizeCombine(N, DCI);
17060 case AMDGPUISD::RCP:
17061 return performRcpCombine(N, DCI);
17062 case ISD::FLDEXP:
17063 case AMDGPUISD::FRACT:
17064 case AMDGPUISD::RSQ:
17065 case AMDGPUISD::RCP_LEGACY:
17066 case AMDGPUISD::RCP_IFLAG:
17067 case AMDGPUISD::RSQ_CLAMP: {
17068 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17069 SDValue Src = N->getOperand(0);
17070 if (Src.isUndef())
17071 return Src;
17072 break;
17073 }
17074 case ISD::SINT_TO_FP:
17075 case ISD::UINT_TO_FP:
17076 return performUCharToFloatCombine(N, DCI);
17077 case ISD::FCOPYSIGN:
17078 return performFCopySignCombine(N, DCI);
17079 case AMDGPUISD::CVT_F32_UBYTE0:
17080 case AMDGPUISD::CVT_F32_UBYTE1:
17081 case AMDGPUISD::CVT_F32_UBYTE2:
17082 case AMDGPUISD::CVT_F32_UBYTE3:
17083 return performCvtF32UByteNCombine(N, DCI);
17084 case AMDGPUISD::FMED3:
17085 return performFMed3Combine(N, DCI);
17086 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17087 return performCvtPkRTZCombine(N, DCI);
17088 case AMDGPUISD::CLAMP:
17089 return performClampCombine(N, DCI);
17090 case ISD::SCALAR_TO_VECTOR: {
17091 SelectionDAG &DAG = DCI.DAG;
17092 EVT VT = N->getValueType(0);
17093
17094 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17095 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17096 SDLoc SL(N);
17097 SDValue Src = N->getOperand(0);
17098 EVT EltVT = Src.getValueType();
17099 if (EltVT != MVT::i16)
17100 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17101
17102 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17103 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17104 }
17105
17106 break;
17107 }
17108 case ISD::EXTRACT_VECTOR_ELT:
17109 return performExtractVectorEltCombine(N, DCI);
17110 case ISD::INSERT_VECTOR_ELT:
17111 return performInsertVectorEltCombine(N, DCI);
17112 case ISD::FP_ROUND:
17113 return performFPRoundCombine(N, DCI);
17114 case ISD::LOAD: {
17115 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17116 return Widened;
17117 [[fallthrough]];
17118 }
17119 default: {
17120 if (!DCI.isBeforeLegalize()) {
17121 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17122 return performMemSDNodeCombine(MemNode, DCI);
17123 }
17124
17125 break;
17126 }
17127 }
17128
17129 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17130}
17131
17132/// Helper function for adjustWritemask
17133static unsigned SubIdx2Lane(unsigned Idx) {
17134 switch (Idx) {
17135 default:
17136 return ~0u;
17137 case AMDGPU::sub0:
17138 return 0;
17139 case AMDGPU::sub1:
17140 return 1;
17141 case AMDGPU::sub2:
17142 return 2;
17143 case AMDGPU::sub3:
17144 return 3;
17145 case AMDGPU::sub4:
17146 return 4; // Possible with TFE/LWE
17147 }
17148}
17149
17150/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17151SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17152 SelectionDAG &DAG) const {
17153 unsigned Opcode = Node->getMachineOpcode();
17154
17155 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17156 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17157 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17158 return Node; // not implemented for D16
17159
17160 SDNode *Users[5] = {nullptr};
17161 unsigned Lane = 0;
17162 unsigned DmaskIdx =
17163 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17164 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17165 unsigned NewDmask = 0;
17166 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17167 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17168 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17169 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17170 unsigned TFCLane = 0;
17171 bool HasChain = Node->getNumValues() > 1;
17172
17173 if (OldDmask == 0) {
17174 // These are folded out, but in case it happens, don't assert.
17175 return Node;
17176 }
17177
17178 unsigned OldBitsSet = llvm::popcount(OldDmask);
17179 // Work out which is the TFE/LWE lane if that is enabled.
17180 if (UsesTFC) {
17181 TFCLane = OldBitsSet;
17182 }
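// Illustrative note (not in the upstream source): for example, an image load
// with dmask = 0xf whose only users extract sub0 and sub2 ends up with
// NewDmask = 0x5; the instruction is reselected below with a narrower result
// register class and its EXTRACT_SUBREG users are renumbered to the packed
// lanes.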
17183
17184 // Try to figure out the used register components
17185 for (SDUse &Use : Node->uses()) {
17186
17187 // Don't look at users of the chain.
17188 if (Use.getResNo() != 0)
17189 continue;
17190
17191 SDNode *User = Use.getUser();
17192
17193 // Abort if we can't understand the usage
17194 if (!User->isMachineOpcode() ||
17195 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17196 return Node;
17197
17198 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17199 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17200 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17201 // set, etc.
17202 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17203 if (Lane == ~0u)
17204 return Node;
17205
17206 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17207 if (UsesTFC && Lane == TFCLane) {
17208 Users[Lane] = User;
17209 } else {
17210 // Set which texture component corresponds to the lane.
17211 unsigned Comp;
17212 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17213 Comp = llvm::countr_zero(Dmask);
17214 Dmask &= ~(1 << Comp);
17215 }
17216
17217 // Abort if we have more than one user per component.
17218 if (Users[Lane])
17219 return Node;
17220
17221 Users[Lane] = User;
17222 NewDmask |= 1 << Comp;
17223 }
17224 }
17225
17226 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17227 bool NoChannels = !NewDmask;
17228 if (NoChannels) {
17229 if (!UsesTFC) {
17230 // No uses of the result and not using TFC. Then do nothing.
17231 return Node;
17232 }
17233 // If the original dmask has one channel - then nothing to do
17234 if (OldBitsSet == 1)
17235 return Node;
17236 // Use an arbitrary dmask - required for the instruction to work
17237 NewDmask = 1;
17238 }
17239 // Abort if there's no change
17240 if (NewDmask == OldDmask)
17241 return Node;
17242
17243 unsigned BitsSet = llvm::popcount(NewDmask);
17244
17245 // Check for TFE or LWE - increase the number of channels by one to account
17246 // for the extra return value
17247 // This will need adjustment for D16 if this is also included in
17248 // adjustWriteMask (this function) but at present D16 are excluded.
17249 unsigned NewChannels = BitsSet + UsesTFC;
17250
17251 int NewOpcode =
17252 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17253 assert(NewOpcode != -1 &&
17254 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17255 "failed to find equivalent MIMG op");
17256
17257 // Adjust the writemask in the node
17258 SmallVector<SDValue, 12> Ops;
17259 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17260 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17261 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17262
17263 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17264
17265 MVT ResultVT = NewChannels == 1
17266 ? SVT
17267 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17268 : NewChannels == 5 ? 8
17269 : NewChannels);
17270 SDVTList NewVTList =
17271 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17272
17273 MachineSDNode *NewNode =
17274 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17275
17276 if (HasChain) {
17277 // Update chain.
17278 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17279 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17280 }
17281
17282 if (NewChannels == 1) {
17283 assert(Node->hasNUsesOfValue(1, 0));
17284 SDNode *Copy =
17285 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17286 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17287 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17288 return nullptr;
17289 }
17290
17291 // Update the users of the node with the new indices
17292 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17293 SDNode *User = Users[i];
17294 if (!User) {
17295 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17296 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17297 if (i || !NoChannels)
17298 continue;
17299 } else {
17300 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17301 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17302 if (NewUser != User) {
17303 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17304 DAG.RemoveDeadNode(User);
17305 }
17306 }
17307
17308 switch (Idx) {
17309 default:
17310 break;
17311 case AMDGPU::sub0:
17312 Idx = AMDGPU::sub1;
17313 break;
17314 case AMDGPU::sub1:
17315 Idx = AMDGPU::sub2;
17316 break;
17317 case AMDGPU::sub2:
17318 Idx = AMDGPU::sub3;
17319 break;
17320 case AMDGPU::sub3:
17321 Idx = AMDGPU::sub4;
17322 break;
17323 }
17324 }
17325
17326 DAG.RemoveDeadNode(Node);
17327 return nullptr;
17328}
17329
17330 static bool isFrameIndexOp(SDValue Op) {
17331 if (Op.getOpcode() == ISD::AssertZext)
17332 Op = Op.getOperand(0);
17333
17334 return isa<FrameIndexSDNode>(Op);
17335}
17336
17337/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17338/// with frame index operands.
17339 /// LLVM assumes that inputs to these instructions are registers.
17340SDNode *
17341 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17342 SelectionDAG &DAG) const {
17343 if (Node->getOpcode() == ISD::CopyToReg) {
17344 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17345 SDValue SrcVal = Node->getOperand(2);
17346
17347 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17348 // to try understanding copies to physical registers.
17349 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17350 SDLoc SL(Node);
17351 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17352 SDValue VReg = DAG.getRegister(
17353 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17354
17355 SDNode *Glued = Node->getGluedNode();
17356 SDValue ToVReg = DAG.getCopyToReg(
17357 Node->getOperand(0), SL, VReg, SrcVal,
17358 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17359 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17360 VReg, ToVReg.getValue(1));
17361 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17362 DAG.RemoveDeadNode(Node);
17363 return ToResultReg.getNode();
17364 }
17365 }
17366
17368 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17369 if (!isFrameIndexOp(Node->getOperand(i))) {
17370 Ops.push_back(Node->getOperand(i));
17371 continue;
17372 }
17373
17374 SDLoc DL(Node);
17375 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17376 Node->getOperand(i).getValueType(),
17377 Node->getOperand(i)),
17378 0));
17379 }
17380
17381 return DAG.UpdateNodeOperands(Node, Ops);
17382}
17383
17384/// Fold the instructions after selecting them.
17385/// Returns null if users were already updated.
17386 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17387 SelectionDAG &DAG) const {
17388 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17389 unsigned Opcode = Node->getMachineOpcode();
17390
17391 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17392 !TII->isGather4(Opcode) &&
17393 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17394 return adjustWritemask(Node, DAG);
17395 }
17396
17397 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17398 legalizeTargetIndependentNode(Node, DAG);
17399 return Node;
17400 }
17401
17402 switch (Opcode) {
17403 case AMDGPU::V_DIV_SCALE_F32_e64:
17404 case AMDGPU::V_DIV_SCALE_F64_e64: {
17405 // Satisfy the operand register constraint when one of the inputs is
17406 // undefined. Ordinarily each undef value will have its own implicit_def of
17407 // a vreg, so force these to use a single register.
17408 SDValue Src0 = Node->getOperand(1);
17409 SDValue Src1 = Node->getOperand(3);
17410 SDValue Src2 = Node->getOperand(5);
17411
17412 if ((Src0.isMachineOpcode() &&
17413 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17414 (Src0 == Src1 || Src0 == Src2))
17415 break;
17416
17417 MVT VT = Src0.getValueType().getSimpleVT();
17418 const TargetRegisterClass *RC =
17419 getRegClassFor(VT, Src0.getNode()->isDivergent());
17420
17421 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17422 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17423
17424 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17425 Src0, SDValue());
17426
17427 // src0 must be the same register as src1 or src2, even if the value is
17428 // undefined, so make sure we don't violate this constraint.
17429 if (Src0.isMachineOpcode() &&
17430 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17431 if (Src1.isMachineOpcode() &&
17432 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17433 Src0 = Src1;
17434 else if (Src2.isMachineOpcode() &&
17435 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17436 Src0 = Src2;
17437 else {
17438 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17439 Src0 = UndefReg;
17440 Src1 = UndefReg;
17441 }
17442 } else
17443 break;
17444
17445 SmallVector<SDValue, 9> Ops(Node->ops());
17446 Ops[1] = Src0;
17447 Ops[3] = Src1;
17448 Ops[5] = Src2;
17449 Ops.push_back(ImpDef.getValue(1));
17450 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17451 }
17452 default:
17453 break;
17454 }
17455
17456 return Node;
17457}
17458
17459// Any MIMG instructions that use tfe or lwe require an initialization of the
17460// result register that will be written in the case of a memory access failure.
17461// The required code is also added to tie this init code to the result of the
17462// img instruction.
17463 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17464 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17465 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17466 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17467 MachineBasicBlock &MBB = *MI.getParent();
17468
17469 int DstIdx =
17470 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17471 unsigned InitIdx = 0;
17472
17473 if (TII->isImage(MI)) {
17474 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17475 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17476 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17477
17478 if (!TFE && !LWE) // intersect_ray
17479 return;
17480
17481 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17482 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17483 unsigned D16Val = D16 ? D16->getImm() : 0;
17484
17485 if (!TFEVal && !LWEVal)
17486 return;
17487
17488 // At least one of TFE or LWE are non-zero
17489 // We have to insert a suitable initialization of the result value and
17490 // tie this to the dest of the image instruction.
17491
17492 // Calculate which dword we have to initialize to 0.
17493 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17494
17495 // check that dmask operand is found.
17496 assert(MO_Dmask && "Expected dmask operand in instruction");
17497
17498 unsigned dmask = MO_Dmask->getImm();
17499 // Determine the number of active lanes taking into account the
17500 // Gather4 special case
17501 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17502
17503 bool Packed = !Subtarget->hasUnpackedD16VMem();
17504
17505 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17506
17507 // Abandon the attempt if the dst size isn't large enough
17508 // - this is in fact an error, but it is picked up elsewhere and
17509 // reported correctly.
17510 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17511
17512 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17513 if (DstSize < InitIdx)
17514 return;
17515 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17516 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17517 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17518 } else {
17519 return;
17520 }
17521
17522 const DebugLoc &DL = MI.getDebugLoc();
17523
17524 // Create a register for the initialization value.
17525 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17526 unsigned NewDst = 0; // Final initialized value will be in here
17527
17528 // If PRTStrictNull feature is enabled (the default) then initialize
17529 // all the result registers to 0, otherwise just the error indication
17530 // register (VGPRn+1)
17531 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17532 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17533
17534 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17535 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17536 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17537 // Initialize dword
17538 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17539 // clang-format off
17540 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17541 .addImm(0);
17542 // clang-format on
17543 // Insert into the super-reg
17544 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17545 .addReg(PrevDst)
17546 .addReg(SubReg)
17547 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17548
17549 PrevDst = NewDst;
17550 }
17551
17552 // Add as an implicit operand
17553 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17554
17555 // Tie the just added implicit operand to the dst
17556 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17557}
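// Illustrative note (not in the upstream source): with PRTStrictNull enabled
// and a 4-dword destination, the code above emits roughly
//   %prev = IMPLICIT_DEF
//   %zero = V_MOV_B32_e32 0
//   %prev = INSERT_SUBREG %prev, %zero, sub0   ; repeated for sub1..sub3
// and then ties that register to the instruction's vdata result, so the
// destination is zero-initialized even when the memory access fails.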
17558
17559/// Assign the register class depending on the number of
17560/// bits set in the writemask
17561 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17562 SDNode *Node) const {
17563 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17564
17565 MachineFunction *MF = MI.getMF();
17566 MachineRegisterInfo &MRI = MF->getRegInfo();
17567
17568 if (TII->isVOP3(MI.getOpcode())) {
17569 // Make sure constant bus requirements are respected.
17570 TII->legalizeOperandsVOP3(MRI, MI);
17571
17572 if (TII->isMAI(MI)) {
17573 // The ordinary src0, src1, src2 were legalized above.
17574 //
17575 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17576 // as a separate instruction.
17577 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17578 AMDGPU::OpName::scale_src0);
17579 if (Src0Idx != -1) {
17580 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17581 AMDGPU::OpName::scale_src1);
17582 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17583 TII->usesConstantBus(MRI, MI, Src1Idx))
17584 TII->legalizeOpWithMove(MI, Src1Idx);
17585 }
17586 }
17587
17588 return;
17589 }
17590
17591 if (TII->isImage(MI))
17592 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17593}
17594
17595 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17596 uint64_t Val) {
17597 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17598 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17599}
17600
17601 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17602 const SDLoc &DL,
17603 SDValue Ptr) const {
17604 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17605
17606 // Build the half of the subregister with the constants before building the
17607 // full 128-bit register. If we are building multiple resource descriptors,
17608 // this will allow CSEing of the 2-component register.
17609 const SDValue Ops0[] = {
17610 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17611 buildSMovImm32(DAG, DL, 0),
17612 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17613 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17614 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17615
17616 SDValue SubRegHi = SDValue(
17617 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17618
17619 // Combine the constants and the pointer.
17620 const SDValue Ops1[] = {
17621 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17622 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17623 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17624
17625 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17626}
17627
17628/// Return a resource descriptor with the 'Add TID' bit enabled
17629/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17630/// of the resource descriptor) to create an offset, which is added to
17631/// the resource pointer.
17632 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17633 SDValue Ptr, uint32_t RsrcDword1,
17634 uint64_t RsrcDword2And3) const {
17635 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17636 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17637 if (RsrcDword1) {
17638 PtrHi =
17639 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17640 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17641 0);
17642 }
17643
17644 SDValue DataLo =
17645 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17646 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17647
17648 const SDValue Ops[] = {
17649 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17650 PtrLo,
17651 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17652 PtrHi,
17653 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17654 DataLo,
17655 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17656 DataHi,
17657 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17658
17659 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17660}
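// Illustrative note (not in the upstream source): the resulting v4i32
// descriptor has the 64-bit pointer in dwords 0-1 (with RsrcDword1 OR'd into
// the high half) and RsrcDword2And3 split across dwords 2-3.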
17661
17662//===----------------------------------------------------------------------===//
17663// SI Inline Assembly Support
17664//===----------------------------------------------------------------------===//
17665
17666std::pair<unsigned, const TargetRegisterClass *>
17667 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17668 StringRef Constraint,
17669 MVT VT) const {
17670 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17671
17672 const TargetRegisterClass *RC = nullptr;
17673 if (Constraint.size() == 1) {
17674 // Check if we cannot determine the bit size of the given value type. This
17675 // can happen, for example, in this situation where we have an empty struct
17676 // (size 0): `call void asm "", "v"({} poison)`.
17677 if (VT == MVT::Other)
17678 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17679 const unsigned BitWidth = VT.getSizeInBits();
17680 switch (Constraint[0]) {
17681 default:
17682 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17683 case 's':
17684 case 'r':
17685 switch (BitWidth) {
17686 case 16:
17687 RC = &AMDGPU::SReg_32RegClass;
17688 break;
17689 case 64:
17690 RC = &AMDGPU::SGPR_64RegClass;
17691 break;
17692 default:
17693 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17694 if (!RC)
17695 return std::pair(0U, nullptr);
17696 break;
17697 }
17698 break;
17699 case 'v':
17700 switch (BitWidth) {
17701 case 1:
17702 return std::pair(0U, nullptr);
17703 case 16:
17704 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17705 : &AMDGPU::VGPR_32_Lo256RegClass;
17706 break;
17707 default:
17708 RC = Subtarget->has1024AddressableVGPRs()
17709 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17710 : TRI->getVGPRClassForBitWidth(BitWidth);
17711 if (!RC)
17712 return std::pair(0U, nullptr);
17713 break;
17714 }
17715 break;
17716 case 'a':
17717 if (!Subtarget->hasMAIInsts())
17718 break;
17719 switch (BitWidth) {
17720 case 1:
17721 return std::pair(0U, nullptr);
17722 case 16:
17723 RC = &AMDGPU::AGPR_32RegClass;
17724 break;
17725 default:
17726 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17727 if (!RC)
17728 return std::pair(0U, nullptr);
17729 break;
17730 }
17731 break;
17732 }
17733 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17734 const unsigned BitWidth = VT.getSizeInBits();
17735 switch (BitWidth) {
17736 case 16:
17737 RC = &AMDGPU::AV_32RegClass;
17738 break;
17739 default:
17740 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17741 if (!RC)
17742 return std::pair(0U, nullptr);
17743 break;
17744 }
17745 }
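// Illustrative note (not in the upstream source): examples of the mapping
// above: "s" with i64 selects SGPR_64, "v" with f32 selects a 32-bit VGPR
// class, "a" with f32 selects AGPR_32 when MAI instructions are present, and
// "VA" selects the combined AV superclass on subtargets with GFX90A
// instructions.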
17746
17747 // We actually support i128, i16 and f16 as inline parameters
17748 // even if they are not reported as legal
17749 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17750 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17751 return std::pair(0U, RC);
17752
17753 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17754 if (Kind != '\0') {
17755 if (Kind == 'v') {
17756 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17757 } else if (Kind == 's') {
17758 RC = &AMDGPU::SGPR_32RegClass;
17759 } else if (Kind == 'a') {
17760 RC = &AMDGPU::AGPR_32RegClass;
17761 }
17762
17763 if (RC) {
17764 if (NumRegs > 1) {
17765 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17766 return std::pair(0U, nullptr);
17767
17768 uint32_t Width = NumRegs * 32;
17769 // Prohibit constraints for register ranges with a width that does not
17770 // match the required type.
17771 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17772 return std::pair(0U, nullptr);
17773
17774 MCRegister Reg = RC->getRegister(Idx);
17775 if (SIRegisterInfo::isVGPRClass(RC))
17776 RC = TRI->getVGPRClassForBitWidth(Width);
17777 else if (SIRegisterInfo::isSGPRClass(RC))
17778 RC = TRI->getSGPRClassForBitWidth(Width);
17779 else if (SIRegisterInfo::isAGPRClass(RC))
17780 RC = TRI->getAGPRClassForBitWidth(Width);
17781 if (RC) {
17782 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17783 if (!Reg) {
17784 // The register class does not contain the requested register,
17785 // e.g., because it is an SGPR pair that would violate alignment
17786 // requirements.
17787 return std::pair(0U, nullptr);
17788 }
17789 return std::pair(Reg, RC);
17790 }
17791 }
17792
17793 // Check for lossy scalar/vector conversions.
17794 if (VT.isVector() && VT.getSizeInBits() != 32)
17795 return std::pair(0U, nullptr);
17796 if (Idx < RC->getNumRegs())
17797 return std::pair(RC->getRegister(Idx), RC);
17798 return std::pair(0U, nullptr);
17799 }
17800 }
17801
17802 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17803 if (Ret.first)
17804 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17805
17806 return Ret;
17807}
17808
17809static bool isImmConstraint(StringRef Constraint) {
17810 if (Constraint.size() == 1) {
17811 switch (Constraint[0]) {
17812 default:
17813 break;
17814 case 'I':
17815 case 'J':
17816 case 'A':
17817 case 'B':
17818 case 'C':
17819 return true;
17820 }
17821 } else if (Constraint == "DA" || Constraint == "DB") {
17822 return true;
17823 }
17824 return false;
17825}
17826
17827SITargetLowering::ConstraintType
17828SITargetLowering::getConstraintType(StringRef Constraint) const {
17829  if (Constraint.size() == 1) {
17830 switch (Constraint[0]) {
17831 default:
17832 break;
17833 case 's':
17834 case 'v':
17835 case 'a':
17836 return C_RegisterClass;
17837 }
17838 } else if (Constraint.size() == 2) {
17839 if (Constraint == "VA")
17840 return C_RegisterClass;
17841 }
17842 if (isImmConstraint(Constraint)) {
17843 return C_Other;
17844 }
17845 return TargetLowering::getConstraintType(Constraint);
17846}
17847
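// Mask an immediate down to the operand's bit width so stray high bits do not
// affect the constraint checks or the encoded value.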
17848static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17849  if (Size < 64) {
17850    Val = Val & maskTrailingOnes<uint64_t>(Size);
17851 }
17852 return Val;
17853}
17854
17855void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17856                                                    StringRef Constraint,
17857 std::vector<SDValue> &Ops,
17858 SelectionDAG &DAG) const {
17859 if (isImmConstraint(Constraint)) {
17860 uint64_t Val;
17861 if (getAsmOperandConstVal(Op, Val) &&
17862 checkAsmConstraintVal(Op, Constraint, Val)) {
17863 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17864 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17865 }
17866 } else {
17867    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17868  }
17869}
17870
17871bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17872  unsigned Size = Op.getScalarValueSizeInBits();
17873 if (Size > 64)
17874 return false;
17875
17876 if (Size == 16 && !Subtarget->has16BitInsts())
17877 return false;
17878
17879  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17880    Val = C->getSExtValue();
17881 return true;
17882 }
17883  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17884    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17885 return true;
17886 }
17887  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17888    if (Size != 16 || Op.getNumOperands() != 2)
17889 return false;
17890 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17891 return false;
17892 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17893 Val = C->getSExtValue();
17894 return true;
17895 }
17896 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17897 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17898 return true;
17899 }
17900 }
17901
17902 return false;
17903}
17904
17905bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17906                                             uint64_t Val) const {
17907 if (Constraint.size() == 1) {
17908 switch (Constraint[0]) {
17909 case 'I':
17910      return AMDGPU::isInlinableIntLiteral(Val);
17911    case 'J':
17912 return isInt<16>(Val);
17913 case 'A':
17914 return checkAsmConstraintValA(Op, Val);
17915 case 'B':
17916 return isInt<32>(Val);
17917 case 'C':
17918 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17919             AMDGPU::isInlinableIntLiteral(Val);
17920    default:
17921 break;
17922 }
17923 } else if (Constraint.size() == 2) {
17924 if (Constraint == "DA") {
17925 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17926 int64_t LoBits = static_cast<int32_t>(Val);
17927 return checkAsmConstraintValA(Op, HiBits, 32) &&
17928 checkAsmConstraintValA(Op, LoBits, 32);
17929 }
17930 if (Constraint == "DB") {
17931 return true;
17932 }
17933 }
17934 llvm_unreachable("Invalid asm constraint");
17935}
17936
17937bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17938                                              unsigned MaxSize) const {
17939 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17940 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17941 if (Size == 16) {
17942 MVT VT = Op.getSimpleValueType();
17943 switch (VT.SimpleTy) {
17944 default:
17945 return false;
17946 case MVT::i16:
17947 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17948 case MVT::f16:
17949 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17950 case MVT::bf16:
17951 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17952 case MVT::v2i16:
17953 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17954 case MVT::v2f16:
17955 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17956 case MVT::v2bf16:
17957 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17958 }
17959 }
17960 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17961 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17962 return true;
17963 return false;
17964}
17965
17966static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17967 switch (UnalignedClassID) {
17968 case AMDGPU::VReg_64RegClassID:
17969 return AMDGPU::VReg_64_Align2RegClassID;
17970 case AMDGPU::VReg_96RegClassID:
17971 return AMDGPU::VReg_96_Align2RegClassID;
17972 case AMDGPU::VReg_128RegClassID:
17973 return AMDGPU::VReg_128_Align2RegClassID;
17974 case AMDGPU::VReg_160RegClassID:
17975 return AMDGPU::VReg_160_Align2RegClassID;
17976 case AMDGPU::VReg_192RegClassID:
17977 return AMDGPU::VReg_192_Align2RegClassID;
17978 case AMDGPU::VReg_224RegClassID:
17979 return AMDGPU::VReg_224_Align2RegClassID;
17980 case AMDGPU::VReg_256RegClassID:
17981 return AMDGPU::VReg_256_Align2RegClassID;
17982 case AMDGPU::VReg_288RegClassID:
17983 return AMDGPU::VReg_288_Align2RegClassID;
17984 case AMDGPU::VReg_320RegClassID:
17985 return AMDGPU::VReg_320_Align2RegClassID;
17986 case AMDGPU::VReg_352RegClassID:
17987 return AMDGPU::VReg_352_Align2RegClassID;
17988 case AMDGPU::VReg_384RegClassID:
17989 return AMDGPU::VReg_384_Align2RegClassID;
17990 case AMDGPU::VReg_512RegClassID:
17991 return AMDGPU::VReg_512_Align2RegClassID;
17992 case AMDGPU::VReg_1024RegClassID:
17993 return AMDGPU::VReg_1024_Align2RegClassID;
17994 case AMDGPU::AReg_64RegClassID:
17995 return AMDGPU::AReg_64_Align2RegClassID;
17996 case AMDGPU::AReg_96RegClassID:
17997 return AMDGPU::AReg_96_Align2RegClassID;
17998 case AMDGPU::AReg_128RegClassID:
17999 return AMDGPU::AReg_128_Align2RegClassID;
18000 case AMDGPU::AReg_160RegClassID:
18001 return AMDGPU::AReg_160_Align2RegClassID;
18002 case AMDGPU::AReg_192RegClassID:
18003 return AMDGPU::AReg_192_Align2RegClassID;
18004 case AMDGPU::AReg_256RegClassID:
18005 return AMDGPU::AReg_256_Align2RegClassID;
18006 case AMDGPU::AReg_512RegClassID:
18007 return AMDGPU::AReg_512_Align2RegClassID;
18008 case AMDGPU::AReg_1024RegClassID:
18009 return AMDGPU::AReg_1024_Align2RegClassID;
18010 default:
18011 return -1;
18012 }
18013}
18014
18015// Figure out which registers should be reserved for stack access. Only after
18016// the function is legalized do we know all of the non-spill stack objects or if
18017// calls are present.
18018void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
18019  MachineRegisterInfo &MRI = MF.getRegInfo();
18020  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18021  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18022 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18023 const SIInstrInfo *TII = ST.getInstrInfo();
18024
18025 if (Info->isEntryFunction()) {
18026 // Callable functions have fixed registers used for stack access.
18027    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
18028  }
18029
18030 // TODO: Move this logic to getReservedRegs()
18031 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18032 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18033 Register SReg = ST.isWave32()
18034 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18035 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18036 &AMDGPU::SGPR_64RegClass);
18037 Info->setSGPRForEXECCopy(SReg);
18038
18039 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18040 Info->getStackPtrOffsetReg()));
18041 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18042 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18043
18044 // We need to worry about replacing the default register with itself in case
18045 // of MIR testcases missing the MFI.
18046 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18047 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18048
18049 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18050 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18051
18052 Info->limitOccupancy(MF);
18053
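  // In wave32 mode, rewrite implicit operands that were added assuming wave64
  // (e.g. VCC) to their 32-bit counterparts.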
18054 if (ST.isWave32() && !MF.empty()) {
18055 for (auto &MBB : MF) {
18056 for (auto &MI : MBB) {
18057 TII->fixImplicitOperands(MI);
18058 }
18059 }
18060 }
18061
18062 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18063 // classes if required. Ideally the register class constraints would differ
18064 // per-subtarget, but there's no easy way to achieve that right now. This is
18065 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18066 // from using them as the register class for legal types.
18067 if (ST.needsAlignedVGPRs()) {
18068 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18069 const Register Reg = Register::index2VirtReg(I);
18070 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18071 if (!RC)
18072 continue;
18073 int NewClassID = getAlignedAGPRClassID(RC->getID());
18074 if (NewClassID != -1)
18075 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18076 }
18077 }
18078
18079  TargetLoweringBase::finalizeLowering(MF);
18080}
18081
18082void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18083                                                     KnownBits &Known,
18084 const APInt &DemandedElts,
18085 const SelectionDAG &DAG,
18086 unsigned Depth) const {
18087 Known.resetAll();
18088 unsigned Opc = Op.getOpcode();
18089 switch (Opc) {
18090  case ISD::INTRINSIC_WO_CHAIN: {
18091    unsigned IID = Op.getConstantOperandVal(0);
18092 switch (IID) {
18093 case Intrinsic::amdgcn_mbcnt_lo:
18094 case Intrinsic::amdgcn_mbcnt_hi: {
18095 const GCNSubtarget &ST =
18096          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
18097      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18098 // most 31 + src1.
18099 Known.Zero.setBitsFrom(
18100 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18101 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18102 Known = KnownBits::add(Known, Known2);
18103 return;
18104 }
18105 }
18106 break;
18107 }
18108 }
18109  return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
18110      Op, Known, DemandedElts, DAG, Depth);
18111}
18112
18113void SITargetLowering::computeKnownBitsForFrameIndex(
18114    const int FI, KnownBits &Known, const MachineFunction &MF) const {
18115  TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
18116
18117 // Set the high bits to zero based on the maximum allowed scratch size per
18118 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18119 // calculation won't overflow, so assume the sign bit is never set.
18120 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18121}
18122
18123static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18124                                   GISelValueTracking &VT, KnownBits &Known,
18125 unsigned Dim) {
18126 unsigned MaxValue =
18127 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18128 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18129}
18130
18131static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18132                             KnownBits &Known, const APInt &DemandedElts,
18133                             unsigned BFEWidth, bool SExt, unsigned Depth) {
18134  const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18135  const MachineOperand &Src1 = MI.getOperand(2);
18136
18137 unsigned Src1Cst = 0;
18138 if (Src1.isImm()) {
18139 Src1Cst = Src1.getImm();
18140 } else if (Src1.isReg()) {
18141 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18142 if (!Cst)
18143 return;
18144 Src1Cst = Cst->Value.getZExtValue();
18145 } else {
18146 return;
18147 }
18148
18149 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18150 // Width is always [22:16].
18151 const unsigned Offset =
18152 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18153 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18154
18155 if (Width >= BFEWidth) // Ill-formed.
18156 return;
18157
18158 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18159 Depth + 1);
18160
18161 Known = Known.extractBits(Width, Offset);
18162
18163 if (SExt)
18164 Known = Known.sext(BFEWidth);
18165 else
18166 Known = Known.zext(BFEWidth);
18167}
18168
18169void SITargetLowering::computeKnownBitsForTargetInstr(
18170    GISelValueTracking &VT, Register R, KnownBits &Known,
18171 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18172 unsigned Depth) const {
18173 Known.resetAll();
18174 const MachineInstr *MI = MRI.getVRegDef(R);
18175 switch (MI->getOpcode()) {
18176 case AMDGPU::S_BFE_I32:
18177 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18178 /*SExt=*/true, Depth);
18179 case AMDGPU::S_BFE_U32:
18180 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18181 /*SExt=*/false, Depth);
18182 case AMDGPU::S_BFE_I64:
18183 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18184 /*SExt=*/true, Depth);
18185 case AMDGPU::S_BFE_U64:
18186 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18187 /*SExt=*/false, Depth);
18188 case AMDGPU::G_INTRINSIC:
18189 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18190 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18191 switch (IID) {
18192 case Intrinsic::amdgcn_workitem_id_x:
18193 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18194 break;
18195 case Intrinsic::amdgcn_workitem_id_y:
18196 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18197 break;
18198 case Intrinsic::amdgcn_workitem_id_z:
18199 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18200 break;
18201 case Intrinsic::amdgcn_mbcnt_lo:
18202 case Intrinsic::amdgcn_mbcnt_hi: {
18203 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18204 // most 31 + src1.
18205 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18206 ? getSubtarget()->getWavefrontSizeLog2()
18207 : 5);
18208 KnownBits Known2;
18209 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18210 Depth + 1);
18211 Known = KnownBits::add(Known, Known2);
18212 break;
18213 }
18214 case Intrinsic::amdgcn_groupstaticsize: {
18215 // We can report everything over the maximum size as 0. We can't report
18216 // based on the actual size because we don't know if it's accurate or not
18217 // at any given point.
18218 Known.Zero.setHighBits(
18219 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18220 break;
18221 }
18222 }
18223 break;
18224 }
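  // Sub-dword buffer loads zero-extend into a full 32-bit result, so the upper
  // bits are known to be zero.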
18225 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18226 Known.Zero.setHighBits(24);
18227 break;
18228 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18229 Known.Zero.setHighBits(16);
18230 break;
18231 case AMDGPU::G_AMDGPU_SMED3:
18232 case AMDGPU::G_AMDGPU_UMED3: {
18233 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18234
18235 KnownBits Known2;
18236 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18237 if (Known2.isUnknown())
18238 break;
18239
18240 KnownBits Known1;
18241 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18242 if (Known1.isUnknown())
18243 break;
18244
18245 KnownBits Known0;
18246 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18247 if (Known0.isUnknown())
18248 break;
18249
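    // A med3 result is always one of its operands, so a bit known to have the
    // same value in all three inputs is also known in the output.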
18250 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18251 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18252 Known.One = Known0.One & Known1.One & Known2.One;
18253 break;
18254 }
18255 }
18256}
18257
18258Align SITargetLowering::computeKnownAlignForTargetInstr(
18259    GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18260    unsigned Depth) const {
18261 const MachineInstr *MI = MRI.getVRegDef(R);
18262 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18263 // FIXME: Can this move to generic code? What about the case where the call
18264 // site specifies a lower alignment?
18265 Intrinsic::ID IID = GI->getIntrinsicID();
18266    LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18267    AttributeList Attrs =
18268 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18269 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18270 return *RetAlign;
18271 }
18272 return Align(1);
18273}
18274
18275Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18276  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18277  const Align CacheLineAlign = Align(64);
18278
18279  // Pre-GFX10 targets did not benefit from loop alignment
18280 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18281 getSubtarget()->hasInstFwdPrefetchBug())
18282 return PrefAlign;
18283
18284 // On GFX10 I$ is 4 x 64 bytes cache lines.
18285 // By default prefetcher keeps one cache line behind and reads two ahead.
18286 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18287 // behind and one ahead.
18288  // Therefore we can benefit from aligning loop headers if the loop fits 192 bytes.
18289  // If the loop fits in 64 bytes it always spans no more than two cache lines
18290  // and does not need an alignment.
18291  // Else if the loop is at most 128 bytes we do not need to modify the prefetch.
18292  // Else if the loop is at most 192 bytes we need two lines behind.
18293
18294  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18295  const MachineBasicBlock *Header = ML->getHeader();
18296 if (Header->getAlignment() != PrefAlign)
18297 return Header->getAlignment(); // Already processed.
18298
18299 unsigned LoopSize = 0;
18300 for (const MachineBasicBlock *MBB : ML->blocks()) {
18301    // If an inner loop block is aligned, assume on average half of the alignment
18302    // size is added as nops.
18303 if (MBB != Header)
18304 LoopSize += MBB->getAlignment().value() / 2;
18305
18306 for (const MachineInstr &MI : *MBB) {
18307 LoopSize += TII->getInstSizeInBytes(MI);
18308 if (LoopSize > 192)
18309 return PrefAlign;
18310 }
18311 }
18312
18313 if (LoopSize <= 64)
18314 return PrefAlign;
18315
18316 if (LoopSize <= 128)
18317 return CacheLineAlign;
18318
18319 // If any of parent loops is surrounded by prefetch instructions do not
18320 // insert new for inner loop, which would reset parent's settings.
18321 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18322 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18323 auto I = Exit->getFirstNonDebugInstr();
18324 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18325 return CacheLineAlign;
18326 }
18327 }
18328
18329 MachineBasicBlock *Pre = ML->getLoopPreheader();
18330 MachineBasicBlock *Exit = ML->getExitBlock();
18331
18332 if (Pre && Exit) {
18333 auto PreTerm = Pre->getFirstTerminator();
18334 if (PreTerm == Pre->begin() ||
18335 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18336 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18337 .addImm(1); // prefetch 2 lines behind PC
18338
18339 auto ExitHead = Exit->getFirstNonDebugInstr();
18340 if (ExitHead == Exit->end() ||
18341 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18342 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18343 .addImm(2); // prefetch 1 line behind PC
18344 }
18345
18346 return CacheLineAlign;
18347}
18348
18349[[maybe_unused]]
18350static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18351 assert(N->getOpcode() == ISD::CopyFromReg);
18352 do {
18353 // Follow the chain until we find an INLINEASM node.
18354 N = N->getOperand(0).getNode();
18355 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18356 return true;
18357 } while (N->getOpcode() == ISD::CopyFromReg);
18358 return false;
18359}
18360
18361bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18362                                                  FunctionLoweringInfo *FLI,
18363                                                  UniformityInfo *UA) const {
18364 switch (N->getOpcode()) {
18365 case ISD::CopyFromReg: {
18366 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18367 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18368 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18369 Register Reg = R->getReg();
18370
18371 // FIXME: Why does this need to consider isLiveIn?
18372 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18373 return !TRI->isSGPRReg(MRI, Reg);
18374
18375 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18376 return UA->isDivergent(V);
18377
18378    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18379    return !TRI->isSGPRReg(MRI, Reg);
18380 }
18381 case ISD::LOAD: {
18382 const LoadSDNode *L = cast<LoadSDNode>(N);
18383 unsigned AS = L->getAddressSpace();
18384 // A flat load may access private memory.
18385    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18386  }
18387 case ISD::CALLSEQ_END:
18388 return true;
18389  case ISD::INTRINSIC_WO_CHAIN:
18390    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18391  case ISD::INTRINSIC_W_CHAIN:
18392    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18393 case AMDGPUISD::ATOMIC_CMP_SWAP:
18394 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18395 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18396 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18397 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18398 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18399 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18400 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18401 case AMDGPUISD::BUFFER_ATOMIC_AND:
18402 case AMDGPUISD::BUFFER_ATOMIC_OR:
18403 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18404 case AMDGPUISD::BUFFER_ATOMIC_INC:
18405 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18406 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18407 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18408 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18409 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18410 // Target-specific read-modify-write atomics are sources of divergence.
18411 return true;
18412 default:
18413 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18414 // Generic read-modify-write atomics are sources of divergence.
18415 return A->readMem() && A->writeMem();
18416 }
18417 return false;
18418 }
18419}
18420
18421bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18422                                               EVT VT) const {
18423  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18424  case MVT::f32:
18425    return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18426  case MVT::f64:
18427  case MVT::f16:
18428    return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18429 default:
18430 return false;
18431 }
18432}
18433
18434bool SITargetLowering::denormalsEnabledForType(
18435    LLT Ty, const MachineFunction &MF) const {
18436 switch (Ty.getScalarSizeInBits()) {
18437 case 32:
18438 return !denormalModeIsFlushAllF32(MF);
18439 case 64:
18440 case 16:
18441 return !denormalModeIsFlushAllF64F16(MF);
18442 default:
18443 return false;
18444 }
18445}
18446
18447bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18448                                                    const APInt &DemandedElts,
18449 const SelectionDAG &DAG,
18450 bool SNaN,
18451 unsigned Depth) const {
18452 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18453    const MachineFunction &MF = DAG.getMachineFunction();
18454    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18455
18456 if (Info->getMode().DX10Clamp)
18457 return true; // Clamped to 0.
18458 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18459 }
18460
18461  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18462                                                            DAG, SNaN, Depth);
18463}
18464
18465// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18466// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18467static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18468  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18469 return true;
18470
18471 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18472 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18473 if (DenormMode == DenormalMode::getPreserveSign())
18474 return true;
18475
18476 // TODO: Remove this.
18477 return RMW->getFunction()
18478 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18479 .getValueAsBool();
18480}
18481
18482static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18483  LLVMContext &Ctx = RMW->getContext();
18484 StringRef MemScope =
18485 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18486
18487 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18488 << "Hardware instruction generated for atomic "
18489 << RMW->getOperationName(RMW->getOperation())
18490 << " operation at memory scope " << MemScope;
18491}
18492
18493static bool isV2F16OrV2BF16(Type *Ty) {
18494 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18495 Type *EltTy = VT->getElementType();
18496 return VT->getNumElements() == 2 &&
18497 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18498 }
18499
18500 return false;
18501}
18502
18503static bool isV2F16(Type *Ty) {
18504  auto *VT = dyn_cast<FixedVectorType>(Ty);
18505  return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18506}
18507
18508static bool isV2BF16(Type *Ty) {
18509  auto *VT = dyn_cast<FixedVectorType>(Ty);
18510  return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18511}
18512
18513/// \return true if atomicrmw integer ops work for the type.
18514static bool isAtomicRMWLegalIntTy(Type *Ty) {
18515 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18516 unsigned BW = IT->getBitWidth();
18517 return BW == 32 || BW == 64;
18518 }
18519
18520 return false;
18521}
18522
18523/// \return true if this atomicrmw xchg type can be selected.
18524static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18525 Type *Ty = RMW->getType();
18526 if (isAtomicRMWLegalIntTy(Ty))
18527 return true;
18528
18529 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18530 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18531 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18532 return BW == 32 || BW == 64;
18533 }
18534
18535 if (Ty->isFloatTy() || Ty->isDoubleTy())
18536 return true;
18537
18538  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18539    return VT->getNumElements() == 2 &&
18540 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18541 }
18542
18543 return false;
18544}
18545
18546/// \returns true if it's valid to emit a native instruction for \p RMW, based
18547/// on the properties of the target memory.
18548static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18549 const AtomicRMWInst *RMW,
18550 bool HasSystemScope) {
18551 // The remote/fine-grained access logic is different from the integer
18552 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18553 // fine-grained access does not work, even for a device local allocation.
18554 //
18555 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18556 // allocations work.
18557 if (HasSystemScope) {
18558    if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18559        RMW->hasMetadata("amdgpu.no.remote.memory"))
18560 return true;
18561 if (Subtarget.hasEmulatedSystemScopeAtomics())
18562 return true;
18563  } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18564    return true;
18565
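  // In all other cases the instruction is only known to be safe when
  // fine-grained memory access has been ruled out via metadata.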
18566 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18567}
18568
18569/// \return Action to perform on AtomicRMWInsts for integer operations.
18570static TargetLowering::AtomicExpansionKind
18571atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18572  return isAtomicRMWLegalIntTy(RMW->getType())
18573             ? TargetLowering::AtomicExpansionKind::None
18574             : TargetLowering::AtomicExpansionKind::CmpXChg;
18575}
18576
18577/// Return if a flat address space atomicrmw can access private memory.
18578static bool flatInstrMayAccessPrivate(const Instruction *I) {
18579  const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18580 return !MD ||
18582}
18583
18591
18592TargetLowering::AtomicExpansionKind
18593SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18594  unsigned AS = RMW->getPointerAddressSpace();
18595  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18596    return getPrivateAtomicExpansionKind(*getSubtarget());
18597
18598 // 64-bit flat atomics that dynamically reside in private memory will silently
18599 // be dropped.
18600 //
18601 // Note that we will emit a new copy of the original atomic in the expansion,
18602 // which will be incrementally relegalized.
18603 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18604 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18605 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18606      flatInstrMayAccessPrivate(RMW))
18607    return AtomicExpansionKind::CustomExpand;
18608
18609 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18610    OptimizationRemarkEmitter ORE(RMW->getFunction());
18611    ORE.emit([=]() {
18612 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18613 });
18614 return Kind;
18615 };
18616
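  // System scope (including the one-address-space variant) may target
  // fine-grained remote allocations, which several cases below must treat
  // conservatively.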
18617 auto SSID = RMW->getSyncScopeID();
18618 bool HasSystemScope =
18619 SSID == SyncScope::System ||
18620 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18621
18622 auto Op = RMW->getOperation();
18623 switch (Op) {
18624  case AtomicRMWInst::Xchg:
18625    // PCIe supports add and xchg for system atomics.
18626    return isAtomicRMWLegalXChgTy(RMW)
18627               ? AtomicExpansionKind::None
18628               : AtomicExpansionKind::CmpXChg;
18629  case AtomicRMWInst::Add:
18630 // PCIe supports add and xchg for system atomics.
18631    return atomicSupportedIfLegalIntType(RMW);
18632  case AtomicRMWInst::Sub:
18633 case AtomicRMWInst::And:
18634 case AtomicRMWInst::Or:
18635 case AtomicRMWInst::Xor:
18636 case AtomicRMWInst::Max:
18637  case AtomicRMWInst::Min:
18638  case AtomicRMWInst::UMax:
18639  case AtomicRMWInst::UMin:
18640  case AtomicRMWInst::UIncWrap:
18641  case AtomicRMWInst::UDecWrap:
18642  case AtomicRMWInst::USubCond:
18643  case AtomicRMWInst::USubSat: {
18644    if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18645      return AtomicExpansionKind::CmpXChg;
18646    if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18647      return AtomicExpansionKind::CmpXChg;
18649 auto *IT = dyn_cast<IntegerType>(RMW->getType());
18650    if (!IT || IT->getBitWidth() != 32)
18651      return AtomicExpansionKind::CmpXChg;
18652 }
18653
18656    if (Subtarget->hasEmulatedSystemScopeAtomics())
18657      return atomicSupportedIfLegalIntType(RMW);
18658
18659 // On most subtargets, for atomicrmw operations other than add/xchg,
18660 // whether or not the instructions will behave correctly depends on where
18661 // the address physically resides and what interconnect is used in the
18662    // system configuration. On some targets the instruction will be a nop,
18663 // and in others synchronization will only occur at degraded device scope.
18664 //
18665 // If the allocation is known local to the device, the instructions should
18666 // work correctly.
18667    if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18668      return atomicSupportedIfLegalIntType(RMW);
18669
18670 // If fine-grained remote memory works at device scope, we don't need to
18671 // do anything.
18672 if (!HasSystemScope &&
18673        Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18674      return atomicSupportedIfLegalIntType(RMW);
18675
18676 // If we are targeting a remote allocated address, it depends what kind of
18677 // allocation the address belongs to.
18678 //
18679 // If the allocation is fine-grained (in host memory, or in PCIe peer
18680 // device memory), the operation will fail depending on the target.
18681 //
18682 // Note fine-grained host memory access does work on APUs or if XGMI is
18683 // used, but we do not know if we are targeting an APU or the system
18684 // configuration from the ISA version/target-cpu.
18685    if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18686      return atomicSupportedIfLegalIntType(RMW);
18687
18688    if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18689        Op == AtomicRMWInst::Xor) {
18690      // Atomic sub/or/xor do not work over PCI express, but atomic add
18691 // does. InstCombine transforms these with 0 to or, so undo that.
18692 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18693          ConstVal && ConstVal->isNullValue())
18694        return AtomicExpansionKind::Expand;
18695 }
18696
18697 // If the allocation could be in remote, fine-grained memory, the rmw
18698 // instructions may fail. cmpxchg should work, so emit that. On some
18699 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18700 // even work, so you're out of luck anyway.
18701
18702 // In summary:
18703 //
18704 // Cases that may fail:
18705 // - fine-grained pinned host memory
18706 // - fine-grained migratable host memory
18707 // - fine-grained PCIe peer device
18708 //
18709 // Cases that should work, but may be treated overly conservatively.
18710 // - fine-grained host memory on an APU
18711 // - fine-grained XGMI peer device
18712      return AtomicExpansionKind::CmpXChg;
18713    }
18714
18715    return atomicSupportedIfLegalIntType(RMW);
18716  }
18717 case AtomicRMWInst::FAdd: {
18718 Type *Ty = RMW->getType();
18719
18720 // TODO: Handle REGION_ADDRESS
18721 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18722 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18723 // is fixed to round-to-nearest-even.
18724 //
18725 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18726 // round-to-nearest-even.
18727 //
18728 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18729 // suggests it is OK if the floating-point mode may not match the calling
18730 // thread.
18731 if (Ty->isFloatTy()) {
18732 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18733                                                   : AtomicExpansionKind::CmpXChg;
18734      }
18735
18736 if (Ty->isDoubleTy()) {
18737 // Ignores denormal mode, but we don't consider flushing mandatory.
18738 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18739                                                   : AtomicExpansionKind::CmpXChg;
18740      }
18741
18742 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18743        return AtomicExpansionKind::None;
18744
18745      return AtomicExpansionKind::CmpXChg;
18746    }
18747
18748 // LDS atomics respect the denormal mode from the mode register.
18749 //
18750 // Traditionally f32 global/buffer memory atomics would unconditionally
18751 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18752 // flush.
18753 //
18754 // On targets with flat atomic fadd, denormals would flush depending on
18755 // whether the target address resides in LDS or global memory. We consider
18756 // this flat-maybe-flush as will-flush.
18757 if (Ty->isFloatTy() &&
18758 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18759        !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18760      return AtomicExpansionKind::CmpXChg;
18761
18762 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18763 // safe. The message phrasing also should be better.
18764 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18765 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18766 // gfx942, gfx12
18767 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18768 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18769 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18770 // gfx90a, gfx942, gfx12
18771 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18772 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18773
18774 // gfx942, gfx12
18775 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18776 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18777 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18778 // gfx90a, gfx942, gfx12
18779 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18780 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18781
18782 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18783 // buffer. gfx12 does have the buffer version.
18784 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18785 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18786 }
18787
18788 // global and flat atomic fadd f64: gfx90a, gfx942.
18789 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18790 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18791
18792 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18793 if (Ty->isFloatTy()) {
18794 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18795 // gfx11+.
18796 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18797 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18798 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18799 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18800 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18801 } else {
18802 // gfx908
18803 if (RMW->use_empty() &&
18804 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18805 isV2F16(Ty))
18806 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18807 }
18808 }
18809
18810 // flat atomic fadd f32: gfx942, gfx11+.
18811 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18812 if (Subtarget->hasFlatAtomicFaddF32Inst())
18813 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18814
18815 // If it is in flat address space, and the type is float, we will try to
18816 // expand it, if the target supports global and lds atomic fadd. The
18817 // reason we need that is, in the expansion, we emit the check of
18818 // address space. If it is in global address space, we emit the global
18819 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18820 // fadd.
18821 if (Subtarget->hasLDSFPAtomicAddF32()) {
18822        if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18823          return AtomicExpansionKind::Expand;
18824        if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18825          return AtomicExpansionKind::Expand;
18826 }
18827 }
18828 }
18829
18830    return AtomicExpansionKind::CmpXChg;
18831  }
18832  case AtomicRMWInst::FMin:
18833  case AtomicRMWInst::FMax: {
18834 Type *Ty = RMW->getType();
18835
18836 // LDS float and double fmin/fmax were always supported.
18837 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18838 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18839                                                 : AtomicExpansionKind::CmpXChg;
18840    }
18841
18842 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18843 // For flat and global cases:
18844 // float, double in gfx7. Manual claims denormal support.
18845 // Removed in gfx8.
18846 // float, double restored in gfx10.
18847 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18848 //
18849 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18850 // no f32.
18851 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18852 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18853 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18854 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18855 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18856 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18857               AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18858      if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18859 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18860 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18861 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18862 }
18863 }
18864
18865    return AtomicExpansionKind::CmpXChg;
18866  }
18867  case AtomicRMWInst::Nand:
18868  case AtomicRMWInst::FSub:
18869  default:
18870    return AtomicExpansionKind::CmpXChg;
18871  }
18872
18873 llvm_unreachable("covered atomicrmw op switch");
18874}
18875
18882
18889
18890TargetLowering::AtomicExpansionKind
18891SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18892  unsigned AddrSpace = CmpX->getPointerAddressSpace();
18893 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18894    return getPrivateAtomicExpansionKind(*getSubtarget());
18895
18896 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18897    return AtomicExpansionKind::None;
18898
18899 const DataLayout &DL = CmpX->getDataLayout();
18900
18901 Type *ValTy = CmpX->getNewValOperand()->getType();
18902
18903 // If a 64-bit flat atomic may alias private, we need to avoid using the
18904 // atomic in the private case.
18905 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18906                                           : AtomicExpansionKind::None;
18907}
18908
18909const TargetRegisterClass *
18910SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18911  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18912  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18913 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18914 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18915 : &AMDGPU::SReg_32RegClass;
18916 if (!TRI->isSGPRClass(RC) && !isDivergent)
18917 return TRI->getEquivalentSGPRClass(RC);
18918 if (TRI->isSGPRClass(RC) && isDivergent) {
18919 if (Subtarget->hasGFX90AInsts())
18920 return TRI->getEquivalentAVClass(RC);
18921 return TRI->getEquivalentVGPRClass(RC);
18922 }
18923
18924 return RC;
18925}
18926
18927// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18928// uniform values (as produced by the mask results of control flow intrinsics)
18929// used outside of divergent blocks. The phi users need to also be treated as
18930// always uniform.
18931//
18932// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18933static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18934 unsigned WaveSize) {
18935 // FIXME: We assume we never cast the mask results of a control flow
18936 // intrinsic.
18937 // Early exit if the type won't be consistent as a compile time hack.
18938 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18939 if (!IT || IT->getBitWidth() != WaveSize)
18940 return false;
18941
18942 if (!isa<Instruction>(V))
18943 return false;
18944 if (!Visited.insert(V).second)
18945 return false;
18946 bool Result = false;
18947 for (const auto *U : V->users()) {
18948    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18949      if (V == U->getOperand(1)) {
18950 switch (Intrinsic->getIntrinsicID()) {
18951 default:
18952 Result = false;
18953 break;
18954 case Intrinsic::amdgcn_if_break:
18955 case Intrinsic::amdgcn_if:
18956 case Intrinsic::amdgcn_else:
18957 Result = true;
18958 break;
18959 }
18960 }
18961 if (V == U->getOperand(0)) {
18962 switch (Intrinsic->getIntrinsicID()) {
18963 default:
18964 Result = false;
18965 break;
18966 case Intrinsic::amdgcn_end_cf:
18967 case Intrinsic::amdgcn_loop:
18968 Result = true;
18969 break;
18970 }
18971 }
18972 } else {
18973 Result = hasCFUser(U, Visited, WaveSize);
18974 }
18975 if (Result)
18976 break;
18977 }
18978 return Result;
18979}
18980
18981bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18982                                               const Value *V) const {
18983 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18984 if (CI->isInlineAsm()) {
18985 // FIXME: This cannot give a correct answer. This should only trigger in
18986 // the case where inline asm returns mixed SGPR and VGPR results, used
18987 // outside the defining block. We don't have a specific result to
18988 // consider, so this assumes if any value is SGPR, the overall register
18989 // also needs to be SGPR.
18990 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18991      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
18992          MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18993 for (auto &TC : TargetConstraints) {
18994 if (TC.Type == InlineAsm::isOutput) {
18995          ComputeConstraintToUse(TC, SDValue());
18996          const TargetRegisterClass *RC =
18997 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18998 TC.ConstraintVT)
18999 .second;
19000 if (RC && SIRI->isSGPRClass(RC))
19001 return true;
19002 }
19003 }
19004 }
19005 }
19006  SmallPtrSet<const Value *, 16> Visited;
19007  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19008}
19009
19010bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19011  for (SDUse &Use : N->uses()) {
19012    if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
19013      if (getBasePtrIndex(M) == Use.getOperandNo())
19014 return true;
19015 }
19016 }
19017 return false;
19018}
19019
19020bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19021                                           SDValue N1) const {
19022 if (!N0.hasOneUse())
19023 return false;
19024 // Take care of the opportunity to keep N0 uniform
19025 if (N0->isDivergent() || !N1->isDivergent())
19026 return true;
19027 // Check if we have a good chance to form the memory access pattern with the
19028 // base and offset
19029 return (DAG.isBaseWithConstantOffset(N0) &&
19030          hasMemSDNodeUser(*N0->user_begin()));
19031}
19032
19033bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19034                                           Register N0, Register N1) const {
19035 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19036}
19037
19038MachineMemOperand::Flags
19039SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19040  // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19041  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19042  if (I.getMetadata("amdgpu.noclobber"))
19043 Flags |= MONoClobber;
19044 if (I.getMetadata("amdgpu.last.use"))
19045 Flags |= MOLastUse;
19046 return Flags;
19047}
19048
19049void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
19050    Instruction *AI) const {
19051 // Given: atomicrmw fadd ptr %addr, float %val ordering
19052 //
19053 // With this expansion we produce the following code:
19054 // [...]
19055 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19056 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19057 //
19058 // atomicrmw.shared:
19059 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19060 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19061 // float %val ordering
19062 // br label %atomicrmw.phi
19063 //
19064 // atomicrmw.check.private:
19065 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19066 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19067 //
19068 // atomicrmw.private:
19069 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19070 // %loaded.private = load float, ptr addrspace(5) %cast.private
19071 // %val.new = fadd float %loaded.private, %val
19072 // store float %val.new, ptr addrspace(5) %cast.private
19073 // br label %atomicrmw.phi
19074 //
19075 // atomicrmw.global:
19076 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19077 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19078 // float %val ordering
19079 // br label %atomicrmw.phi
19080 //
19081 // atomicrmw.phi:
19082 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19083 // [ %loaded.private, %atomicrmw.private ],
19084 // [ %loaded.global, %atomicrmw.global ]
19085 // br label %atomicrmw.end
19086 //
19087 // atomicrmw.end:
19088 // [...]
19089 //
19090 //
19091 // For 64-bit atomics which may reside in private memory, we perform a simpler
19092 // version that only inserts the private check, and uses the flat operation.
19093
19094 IRBuilder<> Builder(AI);
19095 LLVMContext &Ctx = Builder.getContext();
19096
19097 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19098 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19099                                 : AtomicCmpXchgInst::getPointerOperandIndex();
19100  Value *Addr = AI->getOperand(PtrOpIdx);
19101
19102 /// TODO: Only need to check private, then emit flat-known-not private (no
19103 /// need for shared block, or cast to global).
19104  auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
19105
19106 Align Alignment;
19107 if (RMW)
19108 Alignment = RMW->getAlign();
19109 else if (CX)
19110 Alignment = CX->getAlign();
19111 else
19112 llvm_unreachable("unhandled atomic operation");
19113
19114 // FullFlatEmulation is true if we need to issue the private, shared, and
19115 // global cases.
19116 //
19117 // If this is false, we are only dealing with the flat-targeting-private case,
19118 // where we only insert a check for private and still use the flat instruction
19119 // for global and shared.
19120
19121 bool FullFlatEmulation =
19122 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19123 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19124 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19125 RMW->getType()->isDoubleTy()));
19126
19127 // If the return value isn't used, do not introduce a false use in the phi.
19128 bool ReturnValueIsUsed = !AI->use_empty();
19129
19130 BasicBlock *BB = Builder.GetInsertBlock();
19131 Function *F = BB->getParent();
19132 BasicBlock *ExitBB =
19133 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19134 BasicBlock *SharedBB = nullptr;
19135
19136 BasicBlock *CheckPrivateBB = BB;
19137 if (FullFlatEmulation) {
19138 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19139 CheckPrivateBB =
19140 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19141 }
19142
19143 BasicBlock *PrivateBB =
19144 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19145 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19146 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19147
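  // Drop the unconditional branch that splitBasicBlock inserted; the explicit
  // control flow for the address-space dispatch is built below.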
19148 std::prev(BB->end())->eraseFromParent();
19149 Builder.SetInsertPoint(BB);
19150
19151 Value *LoadedShared = nullptr;
19152 if (FullFlatEmulation) {
19153 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19154 {Addr}, nullptr, "is.shared");
19155 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19156 Builder.SetInsertPoint(SharedBB);
19157 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19158        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19159
19160 Instruction *Clone = AI->clone();
19161 Clone->insertInto(SharedBB, SharedBB->end());
19162 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19163 LoadedShared = Clone;
19164
19165 Builder.CreateBr(PhiBB);
19166 Builder.SetInsertPoint(CheckPrivateBB);
19167 }
19168
19169 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19170 {Addr}, nullptr, "is.private");
19171 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19172
19173 Builder.SetInsertPoint(PrivateBB);
19174
19175 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19176      Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19177
19178 Value *LoadedPrivate;
19179 if (RMW) {
19180 LoadedPrivate = Builder.CreateAlignedLoad(
19181 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19182
19183 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19184 LoadedPrivate, RMW->getValOperand());
19185
19186 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19187 } else {
19188 auto [ResultLoad, Equal] =
19189 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19190 CX->getNewValOperand(), CX->getAlign());
19191
19192 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19193 ResultLoad, 0);
19194 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19195 }
19196
19197 Builder.CreateBr(PhiBB);
19198
19199 Builder.SetInsertPoint(GlobalBB);
19200
19201 // Continue using a flat instruction if we only emitted the check for private.
19202 Instruction *LoadedGlobal = AI;
19203 if (FullFlatEmulation) {
19204 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19205        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19206    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19207 }
19208
19209 AI->removeFromParent();
19210 AI->insertInto(GlobalBB, GlobalBB->end());
19211
19212 // The new atomicrmw may go through another round of legalization later.
19213 if (!FullFlatEmulation) {
19214 // We inserted the runtime check already, make sure we do not try to
19215 // re-expand this.
19216 // TODO: Should union with any existing metadata.
19217 MDBuilder MDB(F->getContext());
19218 MDNode *RangeNotPrivate =
19219        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19220                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19221    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19222 RangeNotPrivate);
19223 }
19224
19225 Builder.CreateBr(PhiBB);
19226
19227 Builder.SetInsertPoint(PhiBB);
19228
19229 if (ReturnValueIsUsed) {
19230 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19231 AI->replaceAllUsesWith(Loaded);
19232 if (FullFlatEmulation)
19233 Loaded->addIncoming(LoadedShared, SharedBB);
19234 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19235 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19236 Loaded->takeName(AI);
19237 }
19238
19239 Builder.CreateBr(ExitBB);
19240}
19241
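// Rewrite a scratch (private address space) atomic access to operate on a flat
// pointer so that it can be selected as a flat instruction.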
19242static void convertScratchAtomicToFlatAtomic(Instruction *I,
19243                                             unsigned PtrOpIdx) {
19244 Value *PtrOp = I->getOperand(PtrOpIdx);
19247
19248 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19249 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19250 I->getIterator());
19251 I->setOperand(PtrOpIdx, ASCast);
19252}
19253
19256
19259
19262 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19263 ConstVal && ConstVal->isNullValue()) {
19264 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19265      AI->setOperation(AtomicRMWInst::Add);
19266
19267 // We may still need the private-alias-flat handling below.
19268
19269 // TODO: Skip this for cases where we cannot access remote memory.
19270 }
19271 }
19272
19273 // The non-flat expansions should only perform the de-canonicalization of
19274 // identity values.
19276 return;
19277
19278  emitExpandAtomicAddrSpacePredicate(AI);
19279}
19280
19287
19291
19292  llvm_unreachable(
19293      "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19294}
19295
19297 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19298 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19299
19300  llvm_unreachable(
19301      "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19302}
19303
19304LoadInst *
19305SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19306  IRBuilder<> Builder(AI);
19307 auto Order = AI->getOrdering();
19308
19309  // The optimization removes the store aspect of the atomicrmw. Therefore, the
19310  // cache must be flushed if the atomic ordering had release semantics. This is
19311  // not necessarily a fence; a release fence just happens to perform that flush.
19312  // Avoid replacing an atomicrmw that has release semantics.
19313 if (isReleaseOrStronger(Order))
19314 return nullptr;
19315
19316 LoadInst *LI = Builder.CreateAlignedLoad(
19317 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19318 LI->setAtomic(Order, AI->getSyncScopeID());
19319 LI->copyMetadata(*AI);
19320 LI->takeName(AI);
19321 AI->replaceAllUsesWith(LI);
19322 AI->eraseFromParent();
19323 return LI;
19324}
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1102
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6053
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1479
bool isNegative() const
Definition APFloat.h:1431
bool isNormal() const
Definition APFloat.h:1435
APInt bitcastToAPInt() const
Definition APFloat.h:1335
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1061
bool isInfinity() const
Definition APFloat.h:1428
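A minimal usage sketch of the APFloat factories and conversion routine listed above; this is illustrative code rather than anything taken from SIISelLowering.cpp, and the helper names are invented.

#include "llvm/ADT/APFloat.h"
#include <cstdint>
using namespace llvm;

// Bit pattern of the largest finite half-precision value (0x7BFF).
uint64_t largestF16Bits() {
  APFloat Largest = APFloat::getLargest(APFloat::IEEEhalf());
  return Largest.bitcastToAPInt().getZExtValue();
}

// Round a value to half precision, reporting whether precision was lost.
bool convertToF16Exactly(APFloat &Val) {
  bool LosesInfo = false;
  (void)Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
                    &LosesInfo);
  return !LosesInfo;
}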
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
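A minimal sketch of the APInt bit-manipulation helpers listed above; the widths and bit positions are arbitrary example values.

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintBitHelpersExample() {
  // Bits [8, 16) set in a 32-bit value.
  APInt Mask = APInt::getBitsSet(32, 8, 16);
  unsigned TZ = Mask.countr_zero();     // 8 trailing zero bits
  bool OnlyBit8 = Mask.isOneBitSet(8);  // false: more than one bit is set

  // Top four bits set, then extend the set region down to bit 16.
  APInt Hi = APInt::getHighBitsSet(32, 4);
  Hi.setBitsFrom(16);
  (void)TZ;
  (void)OnlyBit8;
}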
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
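A minimal sketch of querying an AtomicRMWInst with the accessors listed above; the predicate itself is an invented example, not backend logic.

#include "llvm/IR/Instructions.h"
using namespace llvm;

bool isMonotonicFAddInAddrSpaceZero(const AtomicRMWInst &RMW) {
  return RMW.getOperation() == AtomicRMWInst::FAdd &&
         RMW.getPointerAddressSpace() == 0 &&
         RMW.getOrdering() == AtomicOrdering::Monotonic;
}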
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
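A minimal sketch of BasicBlock::splitBasicBlock as listed above; the split point and the block-name suffix are arbitrary.

#include "llvm/ADT/Twine.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

BasicBlock *splitAtInstruction(Instruction &I) {
  BasicBlock *BB = I.getParent();
  // Instructions from I to the end of BB move into the returned block; BB is
  // terminated with an unconditional branch to it.
  return BB->splitBasicBlock(I.getIterator(), BB->getName() + ".split");
}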
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
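A minimal sketch of the CCState/CCValAssign workflow listed above; it assumes the caller already constructed the CCState over ArgLocs and picked an assignment function, as DAG lowering code typically does.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

void walkIncomingArgLocations(CCState &CCInfo,
                              const SmallVectorImpl<ISD::InputArg> &Ins,
                              CCAssignFn *AssignFn,
                              const SmallVectorImpl<CCValAssign> &ArgLocs) {
  // Fill ArgLocs (the list CCInfo was constructed with), then inspect it.
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();        // assigned to a physical register
    else
      (void)VA.getLocMemOffset();  // assigned to a stack slot at this offset
  }
}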
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:222
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
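A minimal sketch of the DataLayout queries listed above; the helper name is invented.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

uint64_t allocSizeWithPadding(const DataLayout &DL, Type *Ty) {
  // ABI alignment and allocation size (which includes tail padding between
  // successive objects of this type).
  Align ABIAlign = DL.getABITypeAlign(Ty);
  (void)ABIAlign;
  return DL.getTypeAllocSize(Ty).getFixedValue();
}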
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
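A minimal sketch of unlinking and re-inserting an instruction with the Instruction methods listed above; where it ends up is arbitrary here.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

void moveToBlockEnd(Instruction &I, BasicBlock &BB) {
  // Detach I from its current block (without deleting it), then append it to
  // BB; real code would normally pick a position before the terminator.
  I.removeFromParent();
  I.insertInto(&BB, BB.end());
}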
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
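A minimal sketch of the LLT constructors and queries listed above; the widths and the address space are example values only.

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

void lltExample() {
  LLT S32 = LLT::scalar(32);
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  LLT S16 = S32.changeElementSize(16);  // a scalar stays scalar, new width
  (void)P1.getSizeInBits();             // 64 bits
  (void)S16.getScalarSizeInBits();      // 16
}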
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
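A minimal sketch combining MDBuilder::createRange with Instruction::setMetadata from the listings above; the [0, 1024) bounds are arbitrary example values.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

void attachRange(Instruction &Load, LLVMContext &Ctx) {
  MDBuilder MDB(Ctx);
  // !range metadata describing the half-open interval [0, 1024).
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  Load.setMetadata(LLVMContext::MD_range, Range);
}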
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector's element count is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
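A minimal sketch of the MVT queries listed above; the chosen types are arbitrary examples.

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

void mvtExample() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
  MVT Elt = V4I32.getScalarType();   // MVT::i32
  (void)Elt;
  (void)V4I32.getSizeInBits();       // 128 bits
  (void)MVT::getIntegerVT(16);       // MVT::i16
}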
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
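A minimal sketch of MachineBasicBlock::splitAt as listed above; the helper name is invented.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

MachineBasicBlock *splitAfter(MachineBasicBlock &MBB, MachineInstr &SplitMI) {
  // SplitMI stays in MBB; everything after it moves into the returned block,
  // whose live-ins are recomputed when UpdateLiveIns is true.
  return MBB.splitAt(SplitMI, /*UpdateLiveIns=*/true);
}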
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
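A minimal sketch of the MachineInstrBuilder chaining style listed above; the opcode, registers, and trailing immediate are placeholders supplied by the caller.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

void buildTwoOperandInstr(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator InsertPt,
                          const DebugLoc &DL, const TargetInstrInfo &TII,
                          unsigned Opcode, Register Dst, Register Src) {
  BuildMI(MBB, InsertPt, DL, TII.get(Opcode), Dst)
      .addReg(Src)
      .addImm(0);  // e.g. an offset operand of zero
}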
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
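A minimal sketch of the SDValue/SDNode accessors listed above; the pattern being matched is an arbitrary example.

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

bool isSingleUseAddWithConstantRHS(SDValue V) {
  SDNode *N = V.getNode();
  if (N->getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  // Operand 1 must be a constant integer node.
  return isa<ConstantSDNode>(N->getOperand(1));
}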
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined into an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
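For illustration only (not taken from this file): a minimal sketch of using this helper from a lowering routine. The function name emitIsZero and the choice of MVT::i1 as the result type are assumptions; the sketch assumes the usual SelectionDAG context.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: build (X == 0) as an i1 SETCC node.
static SDValue emitIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  return DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETEQ);
}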
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
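A hedged sketch of inverting a boolean condition through this helper; invertCondition is an illustrative name, not an API from this file.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: logically invert Cond, lowered as (XOR Cond, -1).
static SDValue invertCondition(SelectionDAG &DAG, const SDLoc &DL, SDValue Cond) {
  return DAG.getNOT(DL, Cond, Cond.getValueType());
}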
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
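A minimal sketch, assuming a fixed 8-byte offset such as the high half of a split 16-byte access; the helper name is illustrative.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: compute BasePtr + 8 bytes for the second half of a split access.
static SDValue addEightBytes(SelectionDAG &DAG, const SDLoc &DL, SDValue BasePtr) {
  return DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(8), DL);
}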
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
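A hedged sketch combining getStore with getFrameIndex and MachinePointerInfo::getFixedStack. The i32 frame-index pointer type, 4-byte alignment, and the helper name are assumptions for illustration only.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: store Val to an existing frame index slot FI.
static SDValue storeToSlot(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                           SDValue Val, int FI) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
  return DAG.getStore(Chain, DL, Val, Ptr,
                      MachinePointerInfo::getFixedStack(MF, FI), Align(4));
}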
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
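A minimal sketch: a signed maximum expressed through this helper; emitSMax is an illustrative name under the usual SelectionDAG assumptions.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: select the larger of A and B using a signed compare.
static SDValue emitSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
  return DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);
}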
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
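A hedged sketch: splatting a 32-bit scalar into a <2 x i32> build vector; the target vector type and helper name are illustrative assumptions.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: replicate Scalar (assumed i32) into both lanes of a v2i32.
static SDValue splatToV2I32(SelectionDAG &DAG, const SDLoc &DL, SDValue Scalar) {
  return DAG.getSplatBuildVector(MVT::v2i32, DL, Scalar);
}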
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
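A minimal sketch: querying whether the two low bits of a value are known clear; the two-bit width and helper name are assumptions.
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical predicate: true if the low two bits of V are known to be zero.
static bool lowTwoBitsClear(SelectionDAG &DAG, SDValue V) {
  unsigned BW = V.getScalarValueSizeInBits();
  return DAG.MaskedValueIsZero(V, APInt::getLowBitsSet(BW, 2));
}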
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
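A hedged sketch: splitting an i64 value into its i32 halves with this helper; splitI64 is an illustrative name and V is assumed to be of type i64.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: return the {low, high} i32 halves of a 64-bit value.
static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &DL,
                                            SDValue V) {
  return DAG.SplitScalar(V, DL, MVT::i32, MVT::i32);
}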
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
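A minimal, self-contained sketch of the StringSwitch idiom; the constraint letters and the Kind enum are illustrative assumptions, not taken from this file.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
enum class Kind { SGPR, VGPR, Unknown };
// Hypothetical classifier: map a one-letter constraint string to a register kind.
static Kind classifyConstraint(llvm::StringRef C) {
  return llvm::StringSwitch<Kind>(C)
      .Case("s", Kind::SGPR)
      .Case("v", Kind::VGPR)
      .Default(Kind::Unknown);
}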
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:423
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:991
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:981
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:963
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:966
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:985
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
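A hedged sketch of the IR-level pattern-matching idiom these helpers belong to; isShlByOne is an illustrative name.
#include "llvm/IR/PatternMatch.h"
// Hypothetical predicate: true if V is (something << 1).
static bool isShlByOne(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X;
  return match(V, m_Shl(m_Value(X), m_One()));
}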
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
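A minimal sketch of the SelectionDAG pattern matcher; isAnyAdd is an illustrative name and the pattern is deliberately trivial.
#include "llvm/CodeGen/SDPatternMatch.h"
// Hypothetical predicate: true if N is an ISD::ADD of any two values.
static bool isAnyAdd(llvm::SDNode *N, const llvm::SelectionDAG *DAG) {
  using namespace llvm::SDPatternMatch;
  llvm::SDValue X, Y;
  return sd_match(N, DAG, m_Add(m_Value(X), m_Value(Y)));
}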
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
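A minimal sketch: checking that an immediate fits a signed 16-bit encoding field; the field width and function name are assumptions.
#include <cstdint>
#include "llvm/Support/MathExtras.h"
// Hypothetical check: does Imm fit in a signed 16-bit immediate field?
static bool fitsSImm16(int64_t Imm) { return llvm::isInt<16>(Imm); }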
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2157
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
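A hedged sketch showing Lo_32 together with the Hi_32 helper listed above; splitImm is an illustrative name.
#include <cstdint>
#include <utility>
#include "llvm/Support/MathExtras.h"
// Hypothetical helper: split a 64-bit immediate into {low, high} 32-bit halves.
static std::pair<uint32_t, uint32_t> splitImm(uint64_t Imm) {
  return {llvm::Lo_32(Imm), llvm::Hi_32(Imm)};
}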
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
constexpr bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1748
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:434
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1779
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1918
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
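A minimal sketch; the 12-bit width is an arbitrary example value.
#include <cstdint>
#include "llvm/Support/MathExtras.h"
// Hypothetical constant: a mask with the low 12 bits set (0xFFF).
static constexpr uint32_t Low12Mask = llvm::maskTrailingOnes<uint32_t>(12);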
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs